# Copyright (c) 2024, Tri Dao, Albert Gu.
"""We want triton==2.1.0 or 2.2.0 for this
"""
import math
from packaging import version
import torch
import torch.nn.functional as F
import triton
import triton.language as tl
from einops import rearrange, repeat
from mamba_ssm.ops.triton.ssd_bmm import _bmm_chunk_fwd, _bmm_chunk_bwd
TRITON_22 = version.parse(triton.__version__) >= version.parse('2.2.0')
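# Autotune pre-hook used by several configs below: zeroes the named kernel
# arguments (buffers that the kernels accumulate into with tl.atomic_add) so
# that every autotuning trial starts from a clean accumulator.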
def init_to_zero(names):
return lambda nargs: [nargs[name].zero_() for name in names if nargs[name] is not None]
@triton.autotune(
configs=[
triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64}, num_stages=3, num_warps=8),
triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4),
triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4),
triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4),
triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4),
triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 64}, num_stages=4, num_warps=4),
triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64}, num_stages=4, num_warps=4),
triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4),
triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32}, num_stages=5, num_warps=2),
triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32}, num_stages=5, num_warps=2),
triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=2),
],
key=['chunk_size', 'hdim', 'dstate', 'IS_CAUSAL'],
)
@triton.jit
def _chunk_scan_fwd_kernel(
# Pointers to matrices
cb_ptr, x_ptr, z_ptr, out_ptr, out_x_ptr, dt_ptr, dA_cumsum_ptr, seq_idx_ptr, C_ptr, prev_states_ptr, D_ptr,
# Matrix dimensions
chunk_size, hdim, dstate,
batch, seqlen, nheads_ngroups_ratio,
# Strides
stride_cb_batch, stride_cb_chunk, stride_cb_head, stride_cb_csize_m, stride_cb_csize_k,
stride_x_batch, stride_x_seqlen, stride_x_head, stride_x_hdim,
stride_z_batch, stride_z_seqlen, stride_z_head, stride_z_hdim,
stride_out_batch, stride_out_seqlen, stride_out_head, stride_out_hdim,
stride_dt_batch, stride_dt_chunk, stride_dt_head, stride_dt_csize,
stride_dA_cs_batch, stride_dA_cs_chunk, stride_dA_cs_head, stride_dA_cs_csize,
stride_seq_idx_batch, stride_seq_idx_seqlen,
stride_C_batch, stride_C_seqlen, stride_C_head, stride_C_dstate,
stride_states_batch, stride_states_chunk, stride_states_head, stride_states_hdim, stride_states_dstate,
stride_D_head,
# Meta-parameters
IS_CAUSAL: tl.constexpr,
HAS_D: tl.constexpr,
D_HAS_HDIM: tl.constexpr,
HAS_Z: tl.constexpr,
HAS_SEQ_IDX: tl.constexpr,
BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
BLOCK_SIZE_DSTATE: tl.constexpr,
IS_TRITON_22: tl.constexpr,
):
pid_bc = tl.program_id(axis=1)
pid_c = pid_bc // batch
pid_b = pid_bc - pid_c * batch
pid_h = tl.program_id(axis=2)
num_pid_n = tl.cdiv(hdim, BLOCK_SIZE_N)
pid_m = tl.program_id(axis=0) // num_pid_n
pid_n = tl.program_id(axis=0) % num_pid_n
cb_ptr += pid_b * stride_cb_batch + pid_c * stride_cb_chunk + (pid_h // nheads_ngroups_ratio) * stride_cb_head
x_ptr += pid_b * stride_x_batch + pid_c * chunk_size * stride_x_seqlen + pid_h * stride_x_head
dt_ptr += pid_b * stride_dt_batch + pid_c * stride_dt_chunk + pid_h * stride_dt_head
dA_cumsum_ptr += pid_b * stride_dA_cs_batch + pid_c * stride_dA_cs_chunk + pid_h * stride_dA_cs_head
C_ptr += pid_b * stride_C_batch + pid_c * chunk_size * stride_C_seqlen + (pid_h // nheads_ngroups_ratio) * stride_C_head
prev_states_ptr += pid_b * stride_states_batch + pid_c * stride_states_chunk + pid_h * stride_states_head
if HAS_SEQ_IDX:
seq_idx_ptr += pid_b * stride_seq_idx_batch + pid_c * chunk_size * stride_seq_idx_seqlen
offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
dA_cs_m = tl.load(dA_cumsum_ptr + offs_m * stride_dA_cs_csize, mask=offs_m < chunk_size, other=0.0).to(tl.float32)
chunk_size_limit = min(chunk_size, seqlen - pid_c * chunk_size)
if HAS_SEQ_IDX:
seq_idx_prev = tl.load(seq_idx_ptr - stride_seq_idx_seqlen, mask=pid_c >= 1, other=0)
seq_idx_m = tl.load(seq_idx_ptr + offs_m * stride_seq_idx_seqlen, mask=offs_m < chunk_size_limit, other=-1)
acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
# Without the if (pid_c > -1), with Triton 2.1.0, I get
# Assertion `!(srcMmaLayout && dstMmaLayout) && "Unexpected mma -> mma layout conversion"' failed.
# With Triton 2.2.0, this works
if IS_TRITON_22 or pid_c > -1:
# Faster to just do 1 iteration with larger BLOCK_SIZE_K, up to block size 128
offs_k_dstate = tl.arange(0, BLOCK_SIZE_DSTATE if BLOCK_SIZE_DSTATE <= 128 else BLOCK_SIZE_K)
C_ptrs = C_ptr + (offs_m[:, None] * stride_C_seqlen + offs_k_dstate[None, :] * stride_C_dstate)
prev_states_ptrs = prev_states_ptr + (offs_n[None, :] * stride_states_hdim + offs_k_dstate[:, None] * stride_states_dstate)
if not HAS_SEQ_IDX:
scale_m = tl.exp(dA_cs_m)
else:
scale_m = tl.where(seq_idx_m == seq_idx_prev, tl.exp(dA_cs_m), 0.0)
if BLOCK_SIZE_DSTATE <= 128:
C = tl.load(C_ptrs, mask=(offs_m[:, None] < chunk_size_limit) & (offs_k_dstate[None, :] < dstate), other=0.0)
prev_states = tl.load(prev_states_ptrs, mask=(offs_k_dstate[:, None] < dstate) & (offs_n[None, :] < hdim), other=0.0)
prev_states = prev_states.to(C_ptr.dtype.element_ty)
acc = tl.dot(C, prev_states) * scale_m[:, None]
else:
for k in range(0, dstate, BLOCK_SIZE_K):
C = tl.load(C_ptrs, mask=(offs_m[:, None] < chunk_size_limit) & (offs_k_dstate[None, :] < dstate - k), other=0.0)
# C = (C * scale_m[:, None]).to(C_ptr.dtype.element_ty)
prev_states = tl.load(prev_states_ptrs, mask=(offs_k_dstate[:, None] < dstate - k) & (offs_n[None, :] < hdim), other=0.0)
prev_states = prev_states.to(C_ptr.dtype.element_ty)
acc += tl.dot(C, prev_states)
C_ptrs += BLOCK_SIZE_K
prev_states_ptrs += BLOCK_SIZE_K
acc *= scale_m[:, None]
offs_k = tl.arange(0, BLOCK_SIZE_K)
cb_ptrs = cb_ptr + (offs_m[:, None] * stride_cb_csize_m + offs_k[None, :] * stride_cb_csize_k)
x_ptrs = x_ptr + (offs_k[:, None] * stride_x_seqlen + offs_n[None, :] * stride_x_hdim)
dt_ptrs = dt_ptr + offs_k * stride_dt_csize
dA_cumsum_ptrs = dA_cumsum_ptr + offs_k * stride_dA_cs_csize
K_MAX = chunk_size_limit if not IS_CAUSAL else min((pid_m + 1) * BLOCK_SIZE_M, chunk_size_limit)
for k in range(0, K_MAX, BLOCK_SIZE_K):
cb = tl.load(cb_ptrs, mask=(offs_m[:, None] < chunk_size) & (offs_k[None, :] < chunk_size - k), other=0.0).to(tl.float32)
dA_cs_k = tl.load(dA_cumsum_ptrs, mask=offs_k < chunk_size - k, other=0.0).to(tl.float32)
# If there's seq_idx, we already set cb[i, j] = 0 for seq_idx[i] != seq_idx[j].
# So we don't need masking wrt seq_idx here.
cb *= tl.exp((dA_cs_m[:, None] - dA_cs_k[None, :]))
dt_k = tl.load(dt_ptrs, mask=offs_k < chunk_size - k, other=0.0).to(tl.float32)
cb *= dt_k
if IS_CAUSAL:
mask = offs_m[:, None] >= k + offs_k[None, :]
cb = tl.where(mask, cb, 0.0)
cb = cb.to(x_ptr.dtype.element_ty)
x = tl.load(x_ptrs, mask=(offs_k[:, None] < chunk_size_limit - k) & (offs_n[None, :] < hdim), other=0.0)
acc += tl.dot(cb, x)
cb_ptrs += BLOCK_SIZE_K * stride_cb_csize_k
x_ptrs += BLOCK_SIZE_K * stride_x_seqlen
dt_ptrs += BLOCK_SIZE_K * stride_dt_csize
dA_cumsum_ptrs += BLOCK_SIZE_K * stride_dA_cs_csize
offs_out_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
offs_out_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
if HAS_D:
if D_HAS_HDIM:
D = tl.load(D_ptr + pid_h * stride_D_head + offs_n, mask=offs_n < hdim, other=0.0).to(tl.float32)
else:
D = tl.load(D_ptr + pid_h * stride_D_head).to(tl.float32)
x_residual = tl.load(x_ptr + (offs_m[:, None] * stride_x_seqlen + offs_n[None, :] * stride_x_hdim),
mask=(offs_m[:, None] < chunk_size_limit) & (offs_n[None, :] < hdim), other=0.0).to(tl.float32)
acc += x_residual * D
if HAS_Z:
out_x_ptr += pid_b * stride_out_batch + pid_c * chunk_size * stride_out_seqlen + pid_h * stride_out_head
out_x_ptrs = out_x_ptr + (stride_out_seqlen * offs_out_m[:, None] + offs_out_n[None, :])
tl.store(out_x_ptrs, acc, mask=(offs_out_m[:, None] < chunk_size_limit) & (offs_out_n[None, :] < hdim))
z_ptr += pid_b * stride_z_batch + pid_c * chunk_size * stride_z_seqlen + pid_h * stride_z_head
z_ptrs = z_ptr + (stride_z_seqlen * offs_out_m[:, None] + stride_z_hdim * offs_out_n[None, :])
z = tl.load(z_ptrs, mask=(offs_out_m[:, None] < chunk_size_limit) & (offs_out_n[None, :] < hdim), other=0.0).to(tl.float32)
acc *= z * tl.sigmoid(z)
out_ptr += pid_b * stride_out_batch + pid_c * chunk_size * stride_out_seqlen + pid_h * stride_out_head
out_ptrs = out_ptr + (stride_out_seqlen * offs_out_m[:, None] + offs_out_n[None, :] * stride_out_hdim)
tl.store(out_ptrs, acc, mask=(offs_out_m[:, None] < chunk_size_limit) & (offs_out_n[None, :] < hdim))
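# --- Hedged reference sketch (illustration only, not used by this file) ------
# A rough pure-PyTorch restatement of what _chunk_scan_fwd_kernel computes for
# a single (batch, chunk) slice, ignoring the HAS_SEQ_IDX path and assuming
# ngroups == nheads so that cb and C are already per-head. The function name
# and the shapes below are assumptions chosen for illustration, not the actual
# calling convention of the wrappers:
#   cb:          (nheads, chunk_size, chunk_size)    C @ B^T for this chunk
#   x, z:        (chunk_size, nheads, hdim)
#   dt:          (nheads, chunk_size)
#   dA_cumsum:   (nheads, chunk_size)
#   C:           (chunk_size, nheads, dstate)
#   prev_states: (nheads, hdim, dstate)
#   D:           (nheads, hdim) or (nheads,), optional
def _chunk_scan_fwd_single_chunk_sketch(cb, x, dt, dA_cumsum, C, prev_states, D=None, z=None):
    chunk_size = x.shape[0]
    # Inter-chunk term: C @ prev_states, scaled by exp(dA_cumsum) (the kernel's scale_m).
    out = torch.einsum("lhn,hpn->lhp", C.float(), prev_states.float())
    out = out * torch.exp(dA_cumsum).transpose(0, 1).unsqueeze(-1)
    # Intra-chunk (causal) term: cb decayed by exp(dA_cs_m - dA_cs_k), weighted by dt,
    # lower-triangular masked, then applied to x.
    decay = torch.exp(dA_cumsum[:, :, None] - dA_cumsum[:, None, :])      # (h, l, s)
    scores = cb.float() * decay * dt[:, None, :]
    causal = torch.tril(torch.ones(chunk_size, chunk_size, dtype=torch.bool, device=x.device))
    scores = scores.masked_fill(~causal, 0.0)
    out = out + torch.einsum("hls,shp->lhp", scores, x.float())
    if D is not None:                                                     # skip connection
        D_ = D if D.dim() == 2 else D[:, None]
        out = out + x.float() * D_
    if z is not None:                                                     # z * sigmoid(z) gate
        out = out * F.silu(z.float())
    return out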
@triton.autotune(
configs=[
# triton.Config({'BLOCK_SIZE_N': 256}, num_stages=4, num_warps=4),
# triton.Config({'BLOCK_SIZE_N': 128}, num_stages=4, num_warps=4),
triton.Config({'BLOCK_SIZE_N': 64}, num_stages=4, num_warps=4),
triton.Config({'BLOCK_SIZE_N': 32}, num_stages=4, num_warps=4),
triton.Config({'BLOCK_SIZE_N': 64}, num_stages=4, num_warps=8),
triton.Config({'BLOCK_SIZE_N': 32}, num_stages=4, num_warps=8),
],
key=['chunk_size', 'hdim', 'dstate'],
)
@triton.jit
def _chunk_scan_fwd_kernel_wip(
# Pointers to matrices
cb_ptr, x_ptr, z_ptr, out_ptr, out_x_ptr, dt_ptr, dA_cumsum_ptr, seq_idx_ptr, C_ptr, B_ptr, prev_states_ptr, D_ptr,
# Matrix dimensions
chunk_size, hdim, dstate,
batch, seqlen, nheads_ngroups_ratio,
# Strides
stride_cb_batch, stride_cb_chunk, stride_cb_head, stride_cb_csize_m, stride_cb_csize_k,
stride_x_batch, stride_x_seqlen, stride_x_head, stride_x_hdim,
stride_z_batch, stride_z_seqlen, stride_z_head, stride_z_hdim,
stride_out_batch, stride_out_seqlen, stride_out_head, stride_out_hdim,
stride_dt_batch, stride_dt_chunk, stride_dt_head, stride_dt_csize,
stride_dA_cs_batch, stride_dA_cs_chunk, stride_dA_cs_head, stride_dA_cs_csize,
stride_seq_idx_batch, stride_seq_idx_seqlen,
stride_C_batch, stride_C_seqlen, stride_C_head, stride_C_dstate,
stride_B_batch, stride_B_seqlen, stride_B_head, stride_B_dstate,
stride_states_batch, stride_states_chunk, stride_states_head, stride_states_hdim, stride_states_dstate,
stride_D_head,
# Meta-parameters
HAS_D: tl.constexpr,
D_HAS_HDIM: tl.constexpr,
HAS_Z: tl.constexpr,
HAS_SEQ_IDX: tl.constexpr,
BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr,
BLOCK_SIZE_DSTATE: tl.constexpr,
):
pid_bc = tl.program_id(axis=1)
pid_c = pid_bc // batch
pid_b = pid_bc - pid_c * batch
pid_h = tl.program_id(axis=2)
pid_n = tl.program_id(axis=0)
cb_ptr += pid_b * stride_cb_batch + pid_c * stride_cb_chunk + (pid_h // nheads_ngroups_ratio) * stride_cb_head
x_ptr += pid_b * stride_x_batch + pid_c * chunk_size * stride_x_seqlen + pid_h * stride_x_head
dt_ptr += pid_b * stride_dt_batch + pid_c * stride_dt_chunk + pid_h * stride_dt_head
dA_cumsum_ptr += pid_b * stride_dA_cs_batch + pid_c * stride_dA_cs_chunk + pid_h * stride_dA_cs_head
C_ptr += pid_b * stride_C_batch + pid_c * chunk_size * stride_C_seqlen + (pid_h // nheads_ngroups_ratio) * stride_C_head
B_ptr += pid_b * stride_B_batch + pid_c * chunk_size * stride_B_seqlen + (pid_h // nheads_ngroups_ratio) * stride_B_head
prev_states_ptr += pid_b * stride_states_batch + pid_c * stride_states_chunk + pid_h * stride_states_head
if HAS_SEQ_IDX:
seq_idx_ptr += pid_b * stride_seq_idx_batch + pid_c * chunk_size * stride_seq_idx_seqlen
out_ptr += pid_b * stride_out_batch + pid_c * chunk_size * stride_out_seqlen + pid_h * stride_out_head
offs_m = tl.arange(0, BLOCK_SIZE_M)
offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
offs_k_dstate = tl.arange(0, BLOCK_SIZE_DSTATE)
C_ptrs = C_ptr + (offs_m[:, None] * stride_C_seqlen + offs_k_dstate[None, :] * stride_C_dstate)
B_ptrs = B_ptr + (offs_m[None, :] * stride_B_seqlen + offs_k_dstate[:, None] * stride_B_dstate)
prev_states_ptrs = prev_states_ptr + (offs_n[None, :] * stride_states_hdim + offs_k_dstate[:, None] * stride_states_dstate)
num_pid_n = tl.cdiv(hdim, BLOCK_SIZE_N)
cb_ptrs = cb_ptr + (offs_m[:, None] * stride_cb_csize_m + offs_m[None, :] * stride_cb_csize_k)
x_ptrs = x_ptr + (offs_m[:, None] * stride_x_seqlen + offs_n[None, :] * stride_x_hdim)
dt_ptrs = dt_ptr + offs_m * stride_dt_csize
out_ptrs = out_ptr + (offs_m[:, None] * stride_out_seqlen + offs_n[None, :] * stride_out_hdim)
prev_states = tl.load(prev_states_ptrs, mask=(offs_k_dstate[:, None] < dstate) & (offs_n[None, :] < hdim), other=0.0)
# if pid_c == 0:
# if pid_b == 0:
# if pid_h == 0:
# tl.device_print("", prev_states)
chunk_size_limit = min(chunk_size, seqlen - pid_c * chunk_size)
# dA_cs_m = tl.load(dA_cumsum_ptr + offs_m * stride_dA_cs_csize, mask=offs_m < chunk_size, other=0.0).to(tl.float32)
# scale_m = tl.exp(dA_cs_m)
# C = tl.load(C_ptrs, mask=(offs_m[:, None] < chunk_size_limit) & (offs_k_dstate[None, :] < dstate), other=0.0)
# acc = tl.dot(C, prev_states.to(C_ptr.dtype.element_ty)) * scale_m[:, None]
# cb = tl.load(cb_ptrs, mask=(offs_m[:, None] < chunk_size) & (offs_m[None, :] < chunk_size), other=0.0).to(tl.float32)
# cb *= tl.exp((dA_cs_m[:, None] - dA_cs_m[None, :]))
# dt_m = tl.load(dt_ptrs, mask=offs_m < chunk_size, other=0.0).to(tl.float32)
# cb *= dt_m
# mask = offs_m[:, None] >= offs_m[None, :]
# cb = tl.where(mask, cb, 0.0)
# cb = cb.to(x_ptr.dtype.element_ty)
# x = tl.load(x_ptrs, mask=(offs_m[:, None] < chunk_size_limit) & (offs_n[None, :] < hdim), other=0.0)
# acc += tl.dot(cb, x)
# if HAS_D:
# if D_HAS_HDIM:
# D = tl.load(D_ptr + pid_h * stride_D_head + offs_n, mask=offs_n < hdim, other=0.0).to(tl.float32)
# else:
# D = tl.load(D_ptr + pid_h * stride_D_head).to(tl.float32)
# acc += x.to(tl.float32) * D
# tl.store(out_ptrs, acc, mask=(offs_m[:, None] < chunk_size_limit) & (offs_n[None, :] < hdim))
for start_m in range(0, chunk_size_limit, BLOCK_SIZE_M):
start_m = tl.multiple_of(start_m, BLOCK_SIZE_M)
dA_cs_m = tl.load(dA_cumsum_ptr + (start_m + offs_m) * stride_dA_cs_csize, mask=offs_m < chunk_size - start_m, other=0.0).to(tl.float32)
if HAS_SEQ_IDX:
seq_idx_prev = tl.load(seq_idx_ptr + start_m - stride_seq_idx_seqlen, mask=pid_c >= 1, other=0)
seq_idx_m = tl.load(seq_idx_ptr + (start_m + offs_m) * stride_seq_idx_seqlen, mask=offs_m < chunk_size_limit - start_m, other=-1)
if not HAS_SEQ_IDX:
scale_m = tl.exp(dA_cs_m)
else:
scale_m = tl.where(seq_idx_m == seq_idx_prev, tl.exp(dA_cs_m), 0.0)
C = tl.load(C_ptrs, mask=(offs_m[:, None] < chunk_size_limit - start_m) & (offs_k_dstate[None, :] < dstate), other=0.0)
acc = tl.dot(C, prev_states.to(C_ptr.dtype.element_ty)) * scale_m[:, None]
# cb = tl.load(cb_ptrs, mask=(offs_m[:, None] < chunk_size - start_m) & (offs_m[None, :] < chunk_size - start_m), other=0.0).to(tl.float32)
# cb *= tl.exp((dA_cs_m[:, None] - dA_cs_m[None, :]))
dt_m = tl.load(dt_ptrs, mask=offs_m < chunk_size - start_m, other=0.0).to(tl.float32)
# cb *= dt_m
# mask = offs_m[:, None] >= offs_m[None, :]
# cb = tl.where(mask, cb, 0.0)
# cb = cb.to(x_ptr.dtype.element_ty)
x = tl.load(x_ptrs, mask=(offs_m[:, None] < chunk_size_limit - start_m) & (offs_n[None, :] < hdim), other=0.0)
# acc += tl.dot(cb, x)
if HAS_D:
if D_HAS_HDIM:
D = tl.load(D_ptr + pid_h * stride_D_head + offs_n, mask=offs_n < hdim, other=0.0).to(tl.float32)
else:
D = tl.load(D_ptr + pid_h * stride_D_head).to(tl.float32)
acc += x.to(tl.float32) * D
# if HAS_Z:
# out_x_ptr += pid_b * stride_out_batch + pid_c * chunk_size * stride_out_seqlen + pid_h * stride_out_head
# out_x_ptrs = out_x_ptr + (stride_out_seqlen * offs_out_m[:, None] + offs_out_n[None, :])
# tl.store(out_x_ptrs, acc, mask=(offs_out_m[:, None] < chunk_size_limit) & (offs_out_n[None, :] < hdim))
# z_ptr += pid_b * stride_z_batch + pid_c * chunk_size * stride_z_seqlen + pid_h * stride_z_head
# z_ptrs = z_ptr + (stride_z_seqlen * offs_out_m[:, None] + stride_z_hdim * offs_out_n[None, :])
# z = tl.load(z_ptrs, mask=(offs_out_m[:, None] < chunk_size_limit) & (offs_out_n[None, :] < hdim), other=0.0).to(tl.float32)
# acc *= z * tl.sigmoid(z)
tl.store(out_ptrs, acc, mask=(offs_m[:, None] < chunk_size_limit - start_m) & (offs_n[None, :] < hdim))
# TODO: this is not correct, and quite a bit slower
if start_m + BLOCK_SIZE_M < chunk_size_limit:
# B = tl.load(B_ptrs, mask=(offs_m[None, :] < chunk_size_limit - start_m) & (offs_k_dstate[:, None] < dstate), other=0.0).to(tl.float32)
B = tl.load(B_ptrs, mask=(offs_m[None, :] < chunk_size_limit - start_m) & (offs_k_dstate[:, None] < dstate), other=0.0)
dA_cs_last = tl.load(dA_cumsum_ptr + (start_m + BLOCK_SIZE_M) * stride_dA_cs_csize).to(tl.float32)
# TODO: seq_idx
scale = tl.exp((dA_cs_last - dA_cs_m)) * dt_m
# B *= scale
B = B.to(x_ptr.dtype.element_ty)
tmp = tl.dot(B, x)
prev_states += tmp.to(prev_states.dtype)
C_ptrs += BLOCK_SIZE_M * stride_C_seqlen
B_ptrs += BLOCK_SIZE_M * stride_B_seqlen
cb_ptrs += BLOCK_SIZE_M * stride_cb_csize_m + BLOCK_SIZE_M * stride_cb_csize_k
x_ptrs += BLOCK_SIZE_M * stride_x_seqlen
dt_ptrs += BLOCK_SIZE_M * stride_dt_csize
out_ptrs += BLOCK_SIZE_M * stride_out_seqlen
@triton.autotune(
configs=[
triton.Config({'BLOCK_SIZE_M': 32}),
triton.Config({'BLOCK_SIZE_M': 64}),
triton.Config({'BLOCK_SIZE_M': 128}),
triton.Config({'BLOCK_SIZE_M': 256}),
],
key=["chunk_size", "hdim"],
)
@triton.jit
def _chunk_scan_bwd_dz_kernel(
# Pointers to matrices
dout_ptr, out_ptr, z_ptr, x_ptr, D_ptr, outz_ptr, dz_ptr, dout_x_ptr, dD_ptr, ddA_cumsum_ptr,
# Matrix dimensions
chunk_size, hdim,
batch, seqlen,
# Strides
stride_dout_batch, stride_dout_seqlen, stride_dout_head, stride_dout_hdim,
stride_out_batch, stride_out_seqlen, stride_out_head, stride_out_hdim,
stride_z_batch, stride_z_seqlen, stride_z_head, stride_z_hdim,
stride_x_batch, stride_x_seqlen, stride_x_head, stride_x_hdim,
stride_D_head,
stride_outz_batch, stride_outz_seqlen, stride_outz_head, stride_outz_hdim,
stride_dz_batch, stride_dz_seqlen, stride_dz_head, stride_dz_hdim,
stride_doutx_batch, stride_doutx_seqlen, stride_doutx_head, stride_doutx_hdim,
stride_dD_batch, stride_dD_chunk, stride_dD_head, stride_dD_csize, stride_dD_hdim,
stride_ddA_cs_batch, stride_ddA_cs_chunk, stride_ddA_cs_head, stride_ddA_cs_csize,
# Meta-parameters
HAS_D: tl.constexpr,
D_HAS_HDIM: tl.constexpr,
HAS_DDACS: tl.constexpr,
RECOMPUTE_OUTPUT: tl.constexpr,
BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr,
):
pid_bc = tl.program_id(axis=1)
pid_c = pid_bc // batch
pid_b = pid_bc - pid_c * batch
pid_h = tl.program_id(axis=2)
pid_m = tl.program_id(axis=0)
dout_ptr += pid_b * stride_dout_batch + pid_c * chunk_size * stride_dout_seqlen + pid_h * stride_dout_head
dout_x_ptr += pid_b * stride_doutx_batch + pid_c * chunk_size * stride_doutx_seqlen + pid_h * stride_doutx_head
out_ptr += pid_b * stride_out_batch + pid_c * chunk_size * stride_out_seqlen + pid_h * stride_out_head
z_ptr += pid_b * stride_z_batch + pid_c * chunk_size * stride_z_seqlen + pid_h * stride_z_head
dz_ptr += pid_b * stride_dz_batch + pid_c * chunk_size * stride_dz_seqlen + pid_h * stride_dz_head
if RECOMPUTE_OUTPUT:
outz_ptr += pid_b * stride_outz_batch + pid_c * chunk_size * stride_outz_seqlen + pid_h * stride_outz_head
if HAS_DDACS:
ddA_cumsum_ptr += pid_b * stride_ddA_cs_batch + pid_c * stride_ddA_cs_chunk + pid_h * stride_ddA_cs_head
if HAS_D:
x_ptr += pid_b * stride_x_batch + pid_c * chunk_size * stride_x_seqlen + pid_h * stride_x_head
dD_ptr += pid_b * stride_dD_batch + pid_c * stride_dD_chunk + pid_h * stride_dD_head + pid_m * stride_dD_csize
offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
offs_n = tl.arange(0, BLOCK_SIZE_N)
dout_ptrs = dout_ptr + (offs_m[:, None] * stride_dout_seqlen + offs_n[None, :] * stride_dout_hdim)
dout_x_ptrs = dout_x_ptr + (offs_m[:, None] * stride_doutx_seqlen + offs_n[None, :] * stride_doutx_hdim)
out_ptrs = out_ptr + (offs_m[:, None] * stride_out_seqlen + offs_n[None, :] * stride_out_hdim)
z_ptrs = z_ptr + (offs_m[:, None] * stride_z_seqlen + offs_n[None, :] * stride_z_hdim)
dz_ptrs = dz_ptr + (offs_m[:, None] * stride_dz_seqlen + offs_n[None, :] * stride_dz_hdim)
if RECOMPUTE_OUTPUT:
outz_ptrs = outz_ptr + (offs_m[:, None] * stride_outz_seqlen + offs_n[None, :] * stride_outz_hdim)
if HAS_D:
x_ptrs = x_ptr + (offs_m[:, None] * stride_x_seqlen + offs_n[None, :] * stride_x_hdim)
if D_HAS_HDIM:
dD_ptrs = dD_ptr + offs_n * stride_dD_hdim
chunk_size_limit = min(chunk_size, seqlen - pid_c * chunk_size)
dout = tl.load(dout_ptrs, mask=(offs_m[:, None] < chunk_size_limit) & (offs_n[None, :] < hdim), other=0.0).to(tl.float32)
out = tl.load(out_ptrs, mask=(offs_m[:, None] < chunk_size_limit) & (offs_n[None, :] < hdim), other=0.0).to(tl.float32)
z = tl.load(z_ptrs, mask=(offs_m[:, None] < chunk_size_limit) & (offs_n[None, :] < hdim), other=0.0).to(tl.float32)
z_sigmoid = tl.sigmoid(z)
if RECOMPUTE_OUTPUT:
outz = out * z * z_sigmoid
tl.store(outz_ptrs, outz, mask=(offs_m[:, None] < chunk_size_limit) & (offs_n[None, :] < hdim))
dz = dout * out * z_sigmoid * (1 + z * (1 - z_sigmoid))
tl.store(dz_ptrs, dz, mask=(offs_m[:, None] < chunk_size_limit) & (offs_n[None, :] < hdim))
dout *= z * z_sigmoid
tl.store(dout_x_ptrs, dout, mask=(offs_m[:, None] < chunk_size_limit) & (offs_n[None, :] < hdim))
if HAS_D:
x = tl.load(x_ptrs, mask=(offs_m[:, None] < chunk_size_limit) & (offs_n[None, :] < hdim), other=0.0).to(tl.float32)
if D_HAS_HDIM:
dD = tl.sum(dout * x, axis=0)
tl.store(dD_ptrs, dD, mask=offs_n < hdim)
D = tl.load(D_ptr + pid_h * stride_D_head + offs_n, mask=offs_n < hdim, other=0.0).to(tl.float32)
else:
dD = tl.sum(dout * x)
tl.store(dD_ptr, dD)
D = tl.load(D_ptr + pid_h * stride_D_head).to(tl.float32)
out -= x * D
if HAS_DDACS:
ddA_cs = tl.sum(dout * out, axis=1)
tl.store(ddA_cumsum_ptr + offs_m * stride_ddA_cs_csize, ddA_cs, mask=offs_m < chunk_size)
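# --- Hedged reference sketch (illustration only, not used by this file) ------
# The closed form in _chunk_scan_bwd_dz_kernel comes from differentiating the
# gate y = out * z * sigmoid(z): d/dz [z * sigmoid(z)] = sigmoid(z) * (1 + z * (1 - sigmoid(z))),
# so dz = dout * out * sigmoid(z) * (1 + z * (1 - sigmoid(z))) and the gradient
# passed back through `out` is dout * z * sigmoid(z). The helper name below is
# made up for illustration:
def _silu_gate_bwd_sketch(dout, out, z):
    z_sigmoid = torch.sigmoid(z)
    dz = dout * out * z_sigmoid * (1 + z * (1 - z_sigmoid))
    dout_x = dout * z * z_sigmoid
    return dz, dout_x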
@triton.autotune(
configs=[
triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64}, num_stages=3, num_warps=8),
triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4),
triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4),
triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4),
triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4),
triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4),
triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32}, num_stages=5, num_warps=2),
triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32}, num_stages=5, num_warps=2),
triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=2),
],
key=['hdim', 'dstate', 'chunk_size'],
)
@triton.jit
def _chunk_scan_bwd_dstates_kernel(
# Pointers to matrices
dout_ptr, c_ptr, dprev_states_ptr, dA_cumsum_ptr, seq_idx_ptr,
# Matrix dimensions
hdim, dstate, chunk_size,
batch, seqlen, nchunks, nheads_ngroups_ratio,
# Strides
stride_dout_batch, stride_dout_seqlen, stride_dout_head, stride_dout_hdim,
stride_c_batch, stride_c_seqlen, stride_c_head, stride_c_dstate,
stride_dprev_states_batch, stride_dprev_states_chunk, stride_dprev_states_head, stride_dprev_states_hdim, stride_dprev_states_dstate,
stride_dA_cs_batch, stride_dA_cs_chunk, stride_dA_cs_head, stride_dA_cs_csize,
stride_seq_idx_batch, stride_seq_idx_seqlen,
# Meta-parameters
HAS_SEQ_IDX: tl.constexpr,
BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
):
pid_bc = tl.program_id(axis=1)
pid_c = pid_bc // batch
pid_b = pid_bc - pid_c * batch
pid_h = tl.program_id(axis=2)
num_pid_n = tl.cdiv(dstate, BLOCK_SIZE_N)
pid_m = tl.program_id(axis=0) // num_pid_n
pid_n = tl.program_id(axis=0) % num_pid_n
c_ptr += pid_b * stride_c_batch + pid_c * chunk_size * stride_c_seqlen + (pid_h // nheads_ngroups_ratio) * stride_c_head
dout_ptr += pid_b * stride_dout_batch + pid_c * chunk_size * stride_dout_seqlen + pid_h * stride_dout_head
dA_cumsum_ptr += pid_b * stride_dA_cs_batch + pid_c * stride_dA_cs_chunk + pid_h * stride_dA_cs_head
if HAS_SEQ_IDX:
seq_idx_ptr += pid_b * stride_seq_idx_batch + pid_c * chunk_size * stride_seq_idx_seqlen
offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
offs_k = tl.arange(0, BLOCK_SIZE_K)
dout_ptrs = dout_ptr + (offs_m[:, None] * stride_dout_hdim + offs_k[None, :] * stride_dout_seqlen)
c_ptrs = c_ptr + (offs_n[None, :] * stride_c_dstate + offs_k[:, None] * stride_c_seqlen)
dA_cumsum_ptrs = dA_cumsum_ptr + offs_k * stride_dA_cs_csize
if HAS_SEQ_IDX:
seq_idx_ptrs = seq_idx_ptr + offs_k * stride_seq_idx_seqlen
chunk_size_limit = min(chunk_size, seqlen - pid_c * chunk_size)
acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
if HAS_SEQ_IDX:
seq_idx_prev = tl.load(seq_idx_ptr - stride_seq_idx_seqlen, mask=pid_c >= 1, other=0)
for k in range(0, chunk_size_limit, BLOCK_SIZE_K):
dout = tl.load(dout_ptrs, mask=(offs_m[:, None] < hdim) & (offs_k[None, :] < chunk_size_limit - k), other=0.0).to(tl.float32)
dA_cs_k = tl.load(dA_cumsum_ptrs, mask=offs_k < chunk_size - k, other=0.0).to(tl.float32)
if not HAS_SEQ_IDX:
scale_k = tl.exp(dA_cs_k)
else:
seq_idx_k = tl.load(seq_idx_ptrs, mask=offs_k < chunk_size_limit - k, other=-1)
scale_k = tl.where(seq_idx_k == seq_idx_prev, tl.exp(dA_cs_k), 0.0)
dout = (dout * scale_k).to(dout_ptr.dtype.element_ty)
c = tl.load(c_ptrs, mask=(offs_k[:, None] < chunk_size_limit - k) & (offs_n[None, :] < dstate), other=0.0)
acc += tl.dot(dout, c)
dout_ptrs += BLOCK_SIZE_K * stride_dout_seqlen
c_ptrs += BLOCK_SIZE_K * stride_c_seqlen
dA_cumsum_ptrs += BLOCK_SIZE_K * stride_dA_cs_csize
if HAS_SEQ_IDX:
seq_idx_ptrs += BLOCK_SIZE_K * stride_seq_idx_seqlen
out = acc.to(dprev_states_ptr.dtype.element_ty)
dprev_states_ptr += pid_b * stride_dprev_states_batch + pid_c * stride_dprev_states_chunk + pid_h * stride_dprev_states_head
offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
dprev_states_ptrs = dprev_states_ptr + (offs_m[:, None] * stride_dprev_states_hdim + offs_n[None, :] * stride_dprev_states_dstate)
tl.store(dprev_states_ptrs, out, mask=(offs_m[:, None] < hdim) & (offs_n[None, :] < dstate))
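# --- Hedged reference sketch (illustration only, not used by this file) ------
# What _chunk_scan_bwd_dstates_kernel accumulates for one (batch, chunk, head),
# ignoring the HAS_SEQ_IDX path and assuming ngroups == nheads. Shapes are
# assumptions for illustration:
#   dout: (chunk_size, hdim), C: (chunk_size, dstate), dA_cumsum: (chunk_size,)
# dprev_states[p, n] = sum_l exp(dA_cumsum[l]) * dout[l, p] * C[l, n]
def _chunk_scan_bwd_dstates_sketch(dout, C, dA_cumsum):
    scale = torch.exp(dA_cumsum.float())
    return torch.einsum("lp,l,ln->pn", dout.float(), scale, C.float())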
@triton.autotune(
configs=[
triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 128}, num_stages=3, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 32}, num_stages=3, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128}, num_stages=3, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64}, num_stages=3, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64}, num_stages=3, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 32}, num_stages=3, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 64}, num_stages=3, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 32}, num_stages=3, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
],
key=['chunk_size', 'dstate', 'hdim'],
)
@triton.jit
def _chunk_scan_bwd_dc_kernel(
# Pointers to matrices
dout_ptr, prev_states_ptr, C_ptr, dA_cumsum_ptr, seq_idx_ptr,
dc_ptr, ddA_cumsum_ptr,
# Matrix dimensions
chunk_size, dstate, hdim,
batch, seqlen, nheads, nheads_per_program, ngroups,
# Strides
stride_dout_batch, stride_dout_seqlen, stride_dout_head, stride_dout_hdim,
stride_prev_states_batch, stride_prev_states_chunk, stride_prev_states_head, stride_prev_states_hdim, stride_prev_states_dstate,
stride_C_batch, stride_C_seqlen, stride_C_head, stride_C_dstate,
stride_dA_cs_batch, stride_dA_cs_chunk, stride_dA_cs_head, stride_dA_cs_csize,
stride_seq_idx_batch, stride_seq_idx_seqlen,
stride_dc_batch, stride_dc_seqlen, stride_dc_split, stride_dc_group, stride_dc_dstate,
stride_ddA_cs_batch, stride_ddA_cs_chunk, stride_ddA_cs_head, stride_ddA_cs_csize,
# Meta-parameters
HAS_DDA_CS: tl.constexpr,
HAS_SEQ_IDX: tl.constexpr,
BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
):
pid_bc = tl.program_id(axis=1)
pid_c = pid_bc // batch
pid_b = pid_bc - pid_c * batch
pid_sg = tl.program_id(axis=2)
pid_s = pid_sg // ngroups
pid_g = pid_sg - pid_s * ngroups
num_pid_n = tl.cdiv(dstate, BLOCK_SIZE_N)
pid_m = tl.program_id(axis=0) // num_pid_n
pid_n = tl.program_id(axis=0) % num_pid_n
dout_ptr += pid_b * stride_dout_batch + pid_c * chunk_size * stride_dout_seqlen + (pid_g * (nheads // ngroups) + pid_s * nheads_per_program) * stride_dout_head
dc_ptr += pid_b * stride_dc_batch + pid_c * chunk_size * stride_dc_seqlen + pid_g * stride_dc_group + pid_s * stride_dc_split
prev_states_ptr += pid_b * stride_prev_states_batch + pid_c * stride_prev_states_chunk + (pid_g * (nheads // ngroups) + pid_s * nheads_per_program) * stride_prev_states_head
dA_cumsum_ptr += pid_b * stride_dA_cs_batch + pid_c * stride_dA_cs_chunk + (pid_g * (nheads // ngroups) + pid_s * nheads_per_program) * stride_dA_cs_head
if HAS_DDA_CS:
C_ptr += pid_b * stride_C_batch + pid_c * chunk_size * stride_C_seqlen + pid_g * stride_C_head
ddA_cumsum_ptr += pid_b * stride_ddA_cs_batch + pid_c * stride_ddA_cs_chunk + (pid_g * (nheads // ngroups) + pid_s * nheads_per_program) * stride_ddA_cs_head
if HAS_SEQ_IDX:
seq_idx_ptr += pid_b * stride_seq_idx_batch + pid_c * chunk_size * stride_seq_idx_seqlen
offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
offs_k = tl.arange(0, BLOCK_SIZE_K)
dout_ptrs = dout_ptr + (offs_m[:, None] * stride_dout_seqlen + offs_k[None, :] * stride_dout_hdim)
prev_states_ptrs = prev_states_ptr + (offs_n[None, :] * stride_prev_states_dstate + offs_k[:, None] * stride_prev_states_hdim)
dA_cumsum_ptrs = dA_cumsum_ptr + offs_m * stride_dA_cs_csize
if HAS_DDA_CS:
C_ptrs = C_ptr + (offs_m[:, None] * stride_C_seqlen + offs_n[None, :] * stride_C_dstate)
ddA_cumsum_ptrs = ddA_cumsum_ptr + offs_m * stride_ddA_cs_csize
chunk_size_limit = min(chunk_size, seqlen - pid_c * chunk_size)
acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
if HAS_DDA_CS:
c = tl.load(C_ptrs, mask=(offs_m[:, None] < chunk_size_limit) & (offs_n[None, :] < dstate), other=0.0).to(tl.float32)
if HAS_SEQ_IDX:
seq_idx_prev = tl.load(seq_idx_ptr - stride_seq_idx_seqlen, mask=pid_c >= 1, other=0)
seq_idx_m = tl.load(seq_idx_ptr + offs_m * stride_seq_idx_seqlen, mask=offs_m < chunk_size_limit, other=-1)
nheads_iter = min(nheads_per_program, nheads // ngroups - pid_s * nheads_per_program)
for h in range(nheads_iter):
dout = tl.load(dout_ptrs, mask=(offs_m[:, None] < chunk_size_limit) & (offs_k[None, :] < hdim), other=0.0)
prev_states = tl.load(prev_states_ptrs, mask=(offs_k[:, None] < hdim) & (offs_n[None, :] < dstate), other=0.0)
prev_states = prev_states.to(dout_ptrs.dtype.element_ty)
dc = tl.dot(dout, prev_states)
dA_cs_m = tl.load(dA_cumsum_ptrs, mask=offs_m < chunk_size_limit, other=0.0).to(tl.float32)
if not HAS_SEQ_IDX:
scale = tl.exp(dA_cs_m)
else:
scale = tl.where(seq_idx_m == seq_idx_prev, tl.exp(dA_cs_m), 0.0)
dc *= scale[:, None]
if HAS_DDA_CS:
ddA_cs = tl.sum(dc * c, axis=1)
tl.atomic_add(ddA_cumsum_ptrs, ddA_cs, mask=offs_m < chunk_size)
acc += dc
dout_ptrs += stride_dout_head
prev_states_ptrs += stride_prev_states_head
dA_cumsum_ptrs += stride_dA_cs_head
if HAS_DDA_CS:
ddA_cumsum_ptrs += stride_ddA_cs_head
# if HAS_SEQ_IDX:
# seq_idx_prev = tl.load(seq_idx_ptr - stride_seq_idx_seqlen, mask=pid_c >= 1, other=0)
# seq_idx_m = tl.load(seq_idx_ptr + offs_m * stride_seq_idx_seqlen, mask=offs_m < chunk_size_limit, other=-1)
# acc = tl.where(seq_idx_m[:, None] == seq_idx_prev, acc, 0.0)
offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
dc_ptrs = dc_ptr + (offs_m[:, None] * stride_dc_seqlen + offs_n[None, :] * stride_dc_dstate)
tl.store(dc_ptrs, acc, mask=(offs_m[:, None] < chunk_size_limit) & (offs_n[None, :] < dstate))
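# --- Hedged reference sketch (illustration only, not used by this file) ------
# What _chunk_scan_bwd_dc_kernel accumulates for one (batch, chunk) and one
# group, summed over the heads mapped to that group, ignoring the HAS_SEQ_IDX
# and HAS_DDA_CS paths. Shapes are assumptions for illustration:
#   dout:        (nheads_in_group, chunk_size, hdim)
#   prev_states: (nheads_in_group, hdim, dstate)
#   dA_cumsum:   (nheads_in_group, chunk_size)
# dC[l, n] = sum_h exp(dA_cumsum[h, l]) * sum_p dout[h, l, p] * prev_states[h, p, n]
def _chunk_scan_bwd_dC_sketch(dout, prev_states, dA_cumsum):
    scale = torch.exp(dA_cumsum.float())
    return torch.einsum("hlp,hpn,hl->ln", dout.float(), prev_states.float(), scale)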
@triton.autotune(
configs=[
triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64}, num_stages=3, num_warps=8, pre_hook=init_to_zero(["ddt_ptr"])),
triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4, pre_hook=init_to_zero(["ddt_ptr"])),
triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4, pre_hook=init_to_zero(["ddt_ptr"])),
triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4, pre_hook=init_to_zero(["ddt_ptr"])),
triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4, pre_hook=init_to_zero(["ddt_ptr"])),
triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4, pre_hook=init_to_zero(["ddt_ptr"])),
triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32}, num_stages=5, num_warps=4, pre_hook=init_to_zero(["ddt_ptr"])),
triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32}, num_stages=5, num_warps=4, pre_hook=init_to_zero(["ddt_ptr"])),
triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4, pre_hook=init_to_zero(["ddt_ptr"])),
],
key=['chunk_size', 'hdim'],
)
@triton.jit
def _chunk_scan_bwd_dx_kernel(
# Pointers to matrices
x_ptr, cb_ptr, dout_ptr, dt_ptr, dA_cumsum_ptr, D_ptr,
dx_ptr, ddt_ptr, # dD_ptr,
# Matrix dimensions
chunk_size, hdim,
batch, seqlen, nheads_ngroups_ratio,
# Strides
stride_x_batch, stride_x_seqlen, stride_x_head, stride_x_hdim,
stride_cb_batch, stride_cb_chunk, stride_cb_head, stride_cb_csize_m, stride_cb_csize_k,
stride_dout_batch, stride_dout_seqlen, stride_dout_head, stride_dout_hdim,
stride_dt_batch, stride_dt_chunk, stride_dt_head, stride_dt_csize,
stride_dA_cs_batch, stride_dA_cs_chunk, stride_dA_cs_head, stride_dA_cs_csize,
stride_D_head,
stride_dx_batch, stride_dx_seqlen, stride_dx_head, stride_dx_hdim,
stride_ddt_batch, stride_ddt_chunk, stride_ddt_head, stride_ddt_csize,
# stride_dD_batch, stride_dD_chunk, stride_dD_head, stride_dD_hdim, stride_dD_csize,
# Meta-parameters
HAS_D: tl.constexpr,
D_HAS_HDIM: tl.constexpr,
BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
):
pid_bc = tl.program_id(axis=1)
pid_c = pid_bc // batch
pid_b = pid_bc - pid_c * batch
pid_h = tl.program_id(axis=2)
num_pid_n = tl.cdiv(hdim, BLOCK_SIZE_N)
pid_m = tl.program_id(axis=0) // num_pid_n
pid_n = tl.program_id(axis=0) % num_pid_n
x_ptr += pid_b * stride_x_batch + pid_c * chunk_size * stride_x_seqlen + pid_h * stride_x_head
cb_ptr += pid_b * stride_cb_batch + pid_c * stride_cb_chunk + (pid_h // nheads_ngroups_ratio) * stride_cb_head
dout_ptr += pid_b * stride_dout_batch + pid_c * chunk_size * stride_dout_seqlen + pid_h * stride_dout_head
dt_ptr += pid_b * stride_dt_batch + pid_c * stride_dt_chunk + pid_h * stride_dt_head
ddt_ptr += pid_b * stride_ddt_batch + pid_c * stride_ddt_chunk + pid_h * stride_ddt_head
dA_cumsum_ptr += pid_b * stride_dA_cs_batch + pid_c * stride_dA_cs_chunk + pid_h * stride_dA_cs_head
# if HAS_D:
# dD_ptr += pid_b * stride_dD_batch + pid_c * stride_dD_chunk + pid_h * stride_dD_head + pid_m * stride_dD_csize
offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
offs_k = tl.arange(0, BLOCK_SIZE_K)
cb_ptrs = cb_ptr + (offs_m[:, None] * stride_cb_csize_m + offs_k[None, :] * stride_cb_csize_k)
dout_ptrs = dout_ptr + (offs_k[:, None] * stride_dout_seqlen + offs_n[None, :] * stride_dout_hdim)
dA_cumsum_ptrs = dA_cumsum_ptr + offs_k * stride_dA_cs_csize
chunk_size_limit = min(chunk_size, seqlen - pid_c * chunk_size)
dA_cs_m = tl.load(dA_cumsum_ptr + offs_m * stride_dA_cs_csize, mask=offs_m < chunk_size_limit, other=0.0).to(tl.float32)
acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
# Not sure why limiting K_MAX gives wrong results; possibly a Triton bug.
# K_MAX = min((pid_m + 1) * BLOCK_SIZE_M, chunk_size_limit)
K_MAX = chunk_size_limit
for k in range(0, K_MAX, BLOCK_SIZE_K):
# For some reason setting mask to (offs_m[:, None] < chunk_size_limit) is much slower
cb = tl.load(cb_ptrs, mask=(offs_m[:, None] < chunk_size) & (offs_k[None, :] < K_MAX - k), other=0.0)
dout = tl.load(dout_ptrs, mask=(offs_k[:, None] < K_MAX - k) & (offs_n[None, :] < hdim), other=0.0)
dA_cs_k = tl.load(dA_cumsum_ptrs, mask=offs_k < K_MAX - k, other=0.0).to(tl.float32)
cb *= tl.exp(dA_cs_k[None, :] - dA_cs_m[:, None])
# If we don't have the (k + offs_k[None, :] < K_MAX) mask, then for indices outside this range
# we might have dA_cs_k = 0.0 and dA_cs_m very negative, and tl.exp will return inf.
# Multiplying with cb, which is 0.0 outside the range, will make the result NaN.
# This will cause NaN in acc, and hence NaN in dx and ddt.
mask = (k + offs_k[None, :] >= offs_m[:, None]) & (k + offs_k[None, :] < K_MAX)
cb = tl.where(mask, cb, 0.0)
cb = cb.to(dout_ptr.dtype.element_ty)
acc += tl.dot(cb, dout)
cb_ptrs += BLOCK_SIZE_K * stride_cb_csize_k
dout_ptrs += BLOCK_SIZE_K * stride_dout_seqlen
dA_cumsum_ptrs += BLOCK_SIZE_K * stride_dA_cs_csize
offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
dt_ptrs = dt_ptr + offs_m * stride_dt_csize
dt_m = tl.load(dt_ptrs, mask=offs_m < chunk_size_limit, other=0.0).to(tl.float32)
dx = acc * dt_m[:, None]
dx_ptr += pid_b * stride_dx_batch + pid_c * chunk_size * stride_dx_seqlen + pid_h * stride_dx_head
dx_ptrs = dx_ptr + (offs_m[:, None] * stride_dx_seqlen + offs_n[None, :] * stride_dx_hdim)
if HAS_D:
dout_res_ptrs = dout_ptr + (offs_m[:, None] * stride_dout_seqlen + offs_n[None, :] * stride_dout_hdim)
dout_res = tl.load(dout_res_ptrs, mask=(offs_m[:, None] < chunk_size_limit) & (offs_n[None, :] < hdim), other=0.0).to(tl.float32)
if D_HAS_HDIM:
D = tl.load(D_ptr + pid_h * stride_D_head + offs_n, mask=offs_n < hdim, other=0.0).to(tl.float32)
else:
D = tl.load(D_ptr + pid_h * stride_D_head).to(tl.float32)
dx += dout_res * D
tl.store(dx_ptrs, dx, mask=(offs_m[:, None] < chunk_size_limit) & (offs_n[None, :] < hdim))
x_ptrs = x_ptr + (offs_m[:, None] * stride_x_seqlen + offs_n[None, :] * stride_x_hdim)
x = tl.load(x_ptrs, mask=(offs_m[:, None] < chunk_size_limit) & (offs_n[None, :] < hdim), other=0.0).to(tl.float32)
ddt = tl.sum(acc * x, axis=1)
ddt_ptrs = ddt_ptr + offs_m * stride_ddt_csize
tl.atomic_add(ddt_ptrs, ddt, mask=offs_m < chunk_size)
# if HAS_D:
# dout_new_ptrs = dout_ptr + (offs_m[:, None] * stride_dout_csize + offs_n[None, :] * stride_dout_hdim)
# dout = tl.load(dout_new_ptrs, mask=(offs_m[:, None] < M) & (offs_n[None, :] < N), other=0.0).to(tl.float32)
# dD = tl.sum(x * dout, axis=0)
# tl.store(dD_ptr + offs_n * stride_dD_hdim, dD, mask=offs_n < N)
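# --- Hedged reference sketch (illustration only, not used by this file) ------
# What _chunk_scan_bwd_dx_kernel accumulates for one (batch, chunk, head),
# stated in terms of the cb tensor it is handed. Shapes are assumptions for
# illustration: cb (chunk_size, chunk_size), dout/x (chunk_size, hdim),
# dt/dA_cumsum (chunk_size,), D (hdim,) or a scalar tensor, optional.
#   acc[m, :] = sum_{k >= m} cb[m, k] * exp(dA_cumsum[k] - dA_cumsum[m]) * dout[k, :]
#   dx = acc * dt[:, None] (+ D * dout),  ddt[m] = sum_p acc[m, p] * x[m, p]
def _chunk_scan_bwd_dx_sketch(cb, dout, dt, dA_cumsum, x, D=None):
    L = cb.shape[0]
    decay = torch.exp(dA_cumsum[None, :] - dA_cumsum[:, None])                 # (m, k)
    upper = torch.triu(torch.ones(L, L, dtype=torch.bool, device=cb.device))   # keep k >= m
    w = (cb.float() * decay).masked_fill(~upper, 0.0)
    acc = w @ dout.float()
    dx = acc * dt[:, None]
    if D is not None:
        dx = dx + dout.float() * D
    ddt = (acc * x.float()).sum(dim=-1)
    return dx, ddt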
# Disabling HAS_DDA_CS for now since it's much slower
@triton.autotune(
configs=[
triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 128}, num_stages=3, num_warps=4),
triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 32}, num_stages=3, num_warps=4),
triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64}, num_stages=3, num_warps=4),
triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 32}, num_stages=3, num_warps=4),
triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 64}, num_stages=3, num_warps=4),
triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 32}, num_stages=3, num_warps=4),
# triton.Config({'BLOCK_SIZE_M': 16}, num_stages=3, num_warps=4),
# triton.Config({'BLOCK_SIZE_M': 32}, num_stages=3, num_warps=4),
# triton.Config({'BLOCK_SIZE_M': 64}, num_stages=3, num_warps=4),
# triton.Config({'BLOCK_SIZE_M': 128}, num_stages=3, num_warps=4),
# triton.Config({'BLOCK_SIZE_M': 16}, num_stages=4, num_warps=8),
# triton.Config({'BLOCK_SIZE_M': 32}, num_stages=4, num_warps=8),
# triton.Config({'BLOCK_SIZE_M': 64}, num_stages=4, num_warps=8),
# triton.Config({'BLOCK_SIZE_M': 128}, num_stages=4, num_warps=8),
],
key=['chunk_size', 'hdim'],
)
# @triton.heuristics({"BLOCK_SIZE_N": lambda args: max(triton.next_power_of_2(args["chunk_size"]), 16)})
# @triton.heuristics({"BLOCK_SIZE_N": lambda args: 32})
@triton.jit
def _chunk_scan_bwd_dcb_kernel(
# Pointers to matrices
x_ptr, dout_ptr, cb_ptr, dt_ptr, dA_cumsum_ptr, seq_idx_ptr,
dcb_ptr, ddA_cumsum_ptr,
# Matrix dimensions
chunk_size, hdim,
batch, seqlen, nheads, nheads_per_program, ngroups,
# Strides
stride_x_batch, stride_x_seqlen, stride_x_head, stride_x_hdim,
stride_dout_batch, stride_dout_seqlen, stride_dout_head, stride_dout_hdim,
stride_cb_batch, stride_cb_chunk, stride_cb_head, stride_cb_csize_m, stride_cb_csize_n,
stride_dt_batch, stride_dt_chunk, stride_dt_head, stride_dt_csize,
stride_dA_cs_batch, stride_dA_cs_chunk, stride_dA_cs_head, stride_dA_cs_csize,
stride_seq_idx_batch, stride_seq_idx_seqlen,
stride_dcb_batch, stride_dcb_chunk, stride_dcb_split, stride_dcb_group, stride_dcb_csize_m, stride_dcb_csize_n,
stride_ddA_cs_batch, stride_ddA_cs_chunk, stride_ddA_cs_head, stride_ddA_cs_csize_m, stride_ddA_cs_csize_n,
# Meta-parameters
HAS_DDA_CS: tl.constexpr,
HAS_SEQ_IDX: tl.constexpr,
BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
):
pid_bc = tl.program_id(axis=1)
pid_c = pid_bc // batch
pid_b = pid_bc - pid_c * batch
pid_sg = tl.program_id(axis=2)
pid_s = pid_sg // ngroups
pid_g = pid_sg - pid_s * ngroups
num_pid_n = tl.cdiv(chunk_size, BLOCK_SIZE_N)
pid_m = tl.program_id(axis=0) // num_pid_n
pid_n = tl.program_id(axis=0) % num_pid_n
x_ptr += pid_b * stride_x_batch + pid_c * chunk_size * stride_x_seqlen + (pid_g * (nheads // ngroups) + pid_s * nheads_per_program) * stride_x_head
dout_ptr += pid_b * stride_dout_batch + pid_c * chunk_size * stride_dout_seqlen + (pid_g * (nheads // ngroups) + pid_s * nheads_per_program) * stride_dout_head
dt_ptr += pid_b * stride_dt_batch + pid_c * stride_dt_chunk + (pid_g * (nheads // ngroups) + pid_s * nheads_per_program) * stride_dt_head
dA_cumsum_ptr += pid_b * stride_dA_cs_batch + pid_c * stride_dA_cs_chunk + (pid_g * (nheads // ngroups) + pid_s * nheads_per_program) * stride_dA_cs_head
if HAS_DDA_CS:
cb_ptr += pid_b * stride_cb_batch + pid_c * stride_cb_chunk + pid_g * stride_cb_head
ddA_cumsum_ptr += pid_b * stride_ddA_cs_batch + pid_c * stride_ddA_cs_chunk + (pid_g * (nheads // ngroups) + pid_s * nheads_per_program) * stride_ddA_cs_head + pid_m * stride_ddA_cs_csize_m
if HAS_SEQ_IDX:
seq_idx_ptr += pid_b * stride_seq_idx_batch + pid_c * chunk_size * stride_seq_idx_seqlen
offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
offs_k = tl.arange(0, BLOCK_SIZE_K)
dout_ptrs = dout_ptr + (offs_m[:, None] * stride_dout_seqlen + offs_k[None, :] * stride_dout_hdim)
x_ptrs = x_ptr + (offs_n[None, :] * stride_x_seqlen + offs_k[:, None] * stride_x_hdim)
dt_ptrs = dt_ptr + offs_n * stride_dt_csize
if HAS_DDA_CS:
cb_ptrs = cb_ptr + (offs_m[:, None] * stride_cb_csize_m + offs_n[None, :] * stride_cb_csize_n)
ddA_cumsum_ptrs = ddA_cumsum_ptr + offs_n * stride_ddA_cs_csize_n
if pid_n * BLOCK_SIZE_N >= (pid_m + 1) * BLOCK_SIZE_M:
dcb_ptr += pid_b * stride_dcb_batch + pid_c * stride_dcb_chunk + pid_g * stride_dcb_group + pid_s * stride_dcb_split
dcb_ptrs = dcb_ptr + (offs_m[:, None] * stride_dcb_csize_m + offs_n[None, :] * stride_dcb_csize_n)
tl.store(dcb_ptrs, tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=dcb_ptr.dtype.element_ty), mask=(offs_m[:, None] < chunk_size) & (offs_n[None, :] < chunk_size))
return
chunk_size_limit = min(chunk_size, seqlen - pid_c * chunk_size)
chunk_size_limit_n = min(chunk_size_limit, (pid_m + 1) * BLOCK_SIZE_M)
acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
if HAS_DDA_CS:
cb = tl.load(cb_ptrs, mask=(offs_m[:, None] < chunk_size) & (offs_n[None, :] < chunk_size), other=0.0).to(tl.float32)
nheads_iter = min(nheads_per_program, nheads // ngroups - pid_s * nheads_per_program)
for h in range(nheads_iter):
dout = tl.load(dout_ptrs, mask=(offs_m[:, None] < chunk_size_limit) & (offs_k[None, :] < hdim), other=0.0)
x = tl.load(x_ptrs, mask=(offs_k[:, None] < hdim) & (offs_n[None, :] < chunk_size_limit_n), other=0.0)
dcb = tl.dot(dout, x)
dt_n = tl.load(dt_ptrs, mask=offs_n < chunk_size, other=0.0).to(tl.float32)
dcb *= dt_n
dA_cs_m = tl.load(dA_cumsum_ptr + offs_m * stride_dA_cs_csize, mask=offs_m < chunk_size_limit, other=0.0).to(tl.float32)
dA_cs_n = tl.load(dA_cumsum_ptr + offs_n * stride_dA_cs_csize, mask=offs_n < chunk_size_limit, other=0.0).to(tl.float32)
dcb *= tl.exp(dA_cs_m[:, None] - dA_cs_n[None, :])
if HAS_DDA_CS:
tl.static_assert(not HAS_SEQ_IDX, "HAS_SEQ_IDX not supported with HAS_DDA_CS yet")
ddA_cs = dcb * cb
mask = offs_m[:, None] >= offs_n[None, :] + 1
ddA_cs = tl.where(mask, ddA_cs, 0.0)
ddA_cs = tl.cumsum(ddA_cs, axis=1)
ddA_cs = tl.where(mask, ddA_cs, 0.0)
ddA_cs = tl.sum(ddA_cs, axis=0)
tl.store(ddA_cumsum_ptrs + stride_ddA_cs_csize_n, ddA_cs, mask=offs_n < chunk_size - 1)
tl.store(ddA_cumsum_ptr, 0.0)
acc += dcb
dout_ptrs += stride_dout_head
x_ptrs += stride_x_head
dt_ptrs += stride_dt_head
dA_cumsum_ptr += stride_dA_cs_head
if HAS_DDA_CS:
ddA_cumsum_ptr += stride_ddA_cs_head
ddA_cumsum_ptrs += stride_ddA_cs_head
offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
if HAS_SEQ_IDX:
seq_idx_m = tl.load(seq_idx_ptr + offs_m * stride_seq_idx_seqlen, mask=offs_m < chunk_size_limit, other=-1)
seq_idx_n = tl.load(seq_idx_ptr + offs_n * stride_seq_idx_seqlen, mask=offs_n < chunk_size_limit, other=-2)
acc = tl.where(seq_idx_m[:, None] == seq_idx_n[None, :], acc, 0.0)
mask = offs_m[:, None] >= offs_n[None, :]
acc = tl.where(mask, acc, 0.0)
dcb_ptr += pid_b * stride_dcb_batch + pid_c * stride_dcb_chunk + pid_g * stride_dcb_group + pid_s * stride_dcb_split
dcb_ptrs = dcb_ptr + (offs_m[:, None] * stride_dcb_csize_m + offs_n[None, :] * stride_dcb_csize_n)
tl.store(dcb_ptrs, acc, mask=(offs_m[:, None] < chunk_size) & (offs_n[None, :] < chunk_size))
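# --- Hedged reference sketch (illustration only, not used by this file) ------
# What _chunk_scan_bwd_dcb_kernel accumulates for one (batch, chunk) and one
# group, summed over the heads in that group, ignoring the HAS_SEQ_IDX and
# HAS_DDA_CS paths. Shapes are assumptions for illustration:
#   dout, x: (nheads_in_group, chunk_size, hdim)
#   dt, dA_cumsum: (nheads_in_group, chunk_size)
# dcb[m, n] = sum_h <dout[h, m], x[h, n]> * dt[h, n] * exp(dA_cumsum[h, m] - dA_cumsum[h, n]),
# with entries above the diagonal (n > m) zeroed out.
def _chunk_scan_bwd_dcb_sketch(dout, x, dt, dA_cumsum):
    L = dout.shape[1]
    dcb = torch.einsum("hmp,hnp->hmn", dout.float(), x.float())
    dcb = dcb * dt[:, None, :] * torch.exp(dA_cumsum[:, :, None] - dA_cumsum[:, None, :])
    causal = torch.tril(torch.ones(L, L, dtype=torch.bool, device=dout.device))
    return dcb.masked_fill(~causal, 0.0).sum(dim=0)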
# Not numerically stable and should not be used. Leaving here for reference.
@triton.autotune(
configs=[
triton.Config({'BLOCK_SIZE_M': 32}),
triton.Config({'BLOCK_SIZE_M': 64}),
triton.Config({'BLOCK_SIZE_M': 128}),
triton.Config({'BLOCK_SIZE_M': 256}),
],
key=["chunk_size", "hdim"],
)
@triton.jit
def _chunk_scan_bwd_ddAcs_unstable_kernel(
# Pointers to matrices
dout_ptr, out_ptr, dt_ptr, ddt_ptr, x_ptr, D_ptr,
ddA_cumsum_ptr, dD_ptr,
# Matrix dimensions
chunk_size, hdim,
batch, seqlen,
# Strides
stride_dout_batch, stride_dout_seqlen, stride_dout_head, stride_dout_hdim,
stride_out_batch, stride_out_seqlen, stride_out_head, stride_out_hdim,
stride_dt_batch, stride_dt_chunk, stride_dt_head, stride_dt_csize,
stride_ddt_batch, stride_ddt_chunk, stride_ddt_head, stride_ddt_csize,
stride_x_batch, stride_x_seqlen, stride_x_head, stride_x_hdim,
stride_D_head,
stride_ddA_cs_batch, stride_ddA_cs_chunk, stride_ddA_cs_head, stride_ddA_cs_csize,
stride_dD_batch, stride_dD_chunk, stride_dD_head, stride_dD_csize, stride_dD_hdim,
# Meta-parameters
HAS_D: tl.constexpr,
D_HAS_HDIM: tl.constexpr,
SUBTRACT_DDTDT: tl.constexpr,
BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr,
):
pid_bc = tl.program_id(axis=1)
pid_c = pid_bc // batch
pid_b = pid_bc - pid_c * batch
pid_h = tl.program_id(axis=2)
pid_m = tl.program_id(axis=0)
dout_ptr += pid_b * stride_dout_batch + pid_c * chunk_size * stride_dout_seqlen + pid_h * stride_dout_head
out_ptr += pid_b * stride_out_batch + pid_c * chunk_size * stride_out_seqlen + pid_h * stride_out_head
dt_ptr += pid_b * stride_dt_batch + pid_c * stride_dt_chunk + pid_h * stride_dt_head
ddt_ptr += pid_b * stride_ddt_batch + pid_c * stride_ddt_chunk + pid_h * stride_ddt_head
ddA_cumsum_ptr += pid_b * stride_ddA_cs_batch + pid_c * stride_ddA_cs_chunk + pid_h * stride_ddA_cs_head
if HAS_D:
x_ptr += pid_b * stride_x_batch + pid_c * chunk_size * stride_x_seqlen + pid_h * stride_x_head
dD_ptr += pid_b * stride_dD_batch + pid_c * stride_dD_chunk + pid_h * stride_dD_head + pid_m * stride_dD_csize
offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
offs_n = tl.arange(0, BLOCK_SIZE_N)
dout_ptrs = dout_ptr + (offs_m[:, None] * stride_dout_seqlen + offs_n[None, :] * stride_dout_hdim)
out_ptrs = out_ptr + (offs_m[:, None] * stride_out_seqlen + offs_n[None, :] * stride_out_hdim)
if HAS_D:
x_ptrs = x_ptr + (offs_m[:, None] * stride_x_seqlen + offs_n[None, :] * stride_x_hdim)
if D_HAS_HDIM:
dD_ptrs = dD_ptr + offs_n * stride_dD_hdim
chunk_size_limit = min(chunk_size, seqlen - pid_c * chunk_size)
dout = tl.load(dout_ptrs, mask=(offs_m[:, None] < chunk_size_limit) & (offs_n[None, :] < hdim), other=0.0).to(tl.float32)
out = tl.load(out_ptrs, mask=(offs_m[:, None] < chunk_size_limit) & (offs_n[None, :] < hdim), other=0.0).to(tl.float32)
if HAS_D:
x = tl.load(x_ptrs, mask=(offs_m[:, None] < chunk_size_limit) & (offs_n[None, :] < hdim), other=0.0).to(tl.float32)
if D_HAS_HDIM:
dD = tl.sum(dout * x, axis=0)
tl.store(dD_ptrs, dD, mask=offs_n < hdim)
D = tl.load(D_ptr + pid_h * stride_D_head + offs_n, mask=offs_n < hdim, other=0.0).to(tl.float32)
else:
dD = tl.sum(dout * x)
tl.store(dD_ptr, dD)
D = tl.load(D_ptr + pid_h * stride_D_head).to(tl.float32)
out -= x * D
ddA_cs = tl.sum(dout * out, axis=1)
if SUBTRACT_DDTDT:
dt = tl.load(dt_ptr + offs_m * stride_dt_csize, mask=offs_m < chunk_size, other=0.0).to(tl.float32)
ddt = tl.load(ddt_ptr + offs_m * stride_ddt_csize, mask=offs_m < chunk_size, other=0.0).to(tl.float32)
ddA_cs -= dt * ddt
tl.store(ddA_cumsum_ptr + offs_m * stride_ddA_cs_csize, ddA_cs, mask=offs_m < chunk_size)
@triton.autotune(
configs=[
# triton.Config({'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_K': 32}, num_stages=3, num_warps=4),
# triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_K': 32}, num_stages=3, num_warps=4),
# triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_K': 32}, num_stages=3, num_warps=4),
# triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_K': 32}, num_stages=3, num_warps=4),
# triton.Config({'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=8),
# triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=8),
# triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=8),
# triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=8),
triton.Config({'BLOCK_SIZE_M': 16}, num_stages=3, num_warps=4),
triton.Config({'BLOCK_SIZE_M': 32}, num_stages=3, num_warps=4),
triton.Config({'BLOCK_SIZE_M': 64}, num_stages=3, num_warps=4),
triton.Config({'BLOCK_SIZE_M': 128}, num_stages=3, num_warps=4),
triton.Config({'BLOCK_SIZE_M': 16}, num_stages=4, num_warps=8),
triton.Config({'BLOCK_SIZE_M': 32}, num_stages=4, num_warps=8),
triton.Config({'BLOCK_SIZE_M': 64}, num_stages=4, num_warps=8),
triton.Config({'BLOCK_SIZE_M': 128}, num_stages=4, num_warps=8),
],
key=['chunk_size', 'hdim'],
)
@triton.jit
def _chunk_scan_bwd_ddAcs_stable_kernel_old(
# Pointers to matrices
x_ptr, dout_ptr, dt_ptr, dA_cumsum_ptr, cb_ptr,
ddAcs_ptr,
# Matrix dimensions
chunk_size, hdim,
batch, seqlen, nheads_ngroups_ratio,
# Strides
stride_x_batch, stride_x_seqlen, stride_x_head, stride_x_hdim,
stride_dout_batch, stride_dout_seqlen, stride_dout_head, stride_dout_hdim,
stride_dt_batch, stride_dt_chunk, stride_dt_head, stride_dt_csize,
stride_dA_cs_batch, stride_dA_cs_chunk, stride_dA_cs_head, stride_dA_cs_csize,
stride_cb_batch, stride_cb_chunk, stride_cb_head, stride_cb_csize_m, stride_cb_csize_n,
stride_ddAcs_batch, stride_ddAcs_chunk, stride_ddAcs_head, stride_ddAcs_csize_m, stride_ddAcs_csize_n,
# Meta-parameters
BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
):
pid_bc = tl.program_id(axis=1)
pid_c = pid_bc // batch
pid_b = pid_bc - pid_c * batch
pid_h = tl.program_id(axis=2)
num_pid_n = tl.cdiv(chunk_size, BLOCK_SIZE_N)
pid_m = tl.program_id(axis=0) // num_pid_n
pid_n = tl.program_id(axis=0) % num_pid_n
x_ptr += pid_b * stride_x_batch + pid_c * chunk_size * stride_x_seqlen + pid_h * stride_x_head
dout_ptr += pid_b * stride_dout_batch + pid_c * chunk_size * stride_dout_seqlen + pid_h * stride_dout_head
dt_ptr += pid_b * stride_dt_batch + pid_c * stride_dt_chunk + pid_h * stride_dt_head
dA_cumsum_ptr += pid_b * stride_dA_cs_batch + pid_c * stride_dA_cs_chunk + pid_h * stride_dA_cs_head
cb_ptr += pid_b * stride_cb_batch + pid_c * stride_cb_chunk + (pid_h // nheads_ngroups_ratio) * stride_cb_head
offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
offs_k = tl.arange(0, BLOCK_SIZE_K)
dout_ptrs = dout_ptr + (offs_m[:, None] * stride_dout_seqlen + offs_k[None, :] * stride_dout_hdim)
x_ptrs = x_ptr + (offs_n[None, :] * stride_x_seqlen + offs_k[:, None] * stride_x_hdim)
dt_ptrs = dt_ptr + offs_n * stride_dt_csize
cb_ptrs = cb_ptr + (offs_m[:, None] * stride_cb_csize_m + offs_n[None, :] * stride_cb_csize_n)
chunk_size_limit = min(chunk_size, seqlen - pid_c * chunk_size)
chunk_size_limit_n = min(chunk_size_limit, (pid_m + 1) * BLOCK_SIZE_M)
# Doing a matmul loop with cumsum later on will cause Triton to crash
# Instead we do just one big matmul
# acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
# for k in range(0, hdim, BLOCK_SIZE_K):
# dout = tl.load(dout_ptrs, mask=(offs_m[:, None] < chunk_size_limit) & (offs_k[None, :] < hdim - k), other=0.0)
# x = tl.load(x_ptrs, mask=(offs_k[:, None] < hdim - k) & (offs_n[None, :] < chunk_size_limit), other=0.0)
# acc += tl.dot(dout, x)
# dout_ptrs += BLOCK_SIZE_K * stride_dout_hdim
# x_ptrs += BLOCK_SIZE_K * stride_x_hdim
dout = tl.load(dout_ptrs, mask=(offs_m[:, None] < chunk_size_limit) & (offs_k[None, :] < hdim), other=0.0)
x = tl.load(x_ptrs, mask=(offs_k[:, None] < hdim) & (offs_n[None, :] < chunk_size_limit_n), other=0.0)
acc = tl.dot(dout, x)
cb = tl.load(cb_ptrs, mask=(offs_m[:, None] < chunk_size) & (offs_n[None, :] < chunk_size), other=0.0).to(tl.float32)
acc *= cb
dt_n = tl.load(dt_ptrs, mask=offs_n < chunk_size, other=0.0).to(tl.float32)
acc *= dt_n
dA_cs_m = tl.load(dA_cumsum_ptr + offs_m * stride_dA_cs_csize, mask=offs_m < chunk_size, other=0.0).to(tl.float32)
dA_cs_n = tl.load(dA_cumsum_ptr + offs_n * stride_dA_cs_csize, mask=offs_n < chunk_size, other=0.0).to(tl.float32)
acc *= tl.exp(dA_cs_m[:, None] - dA_cs_n[None, :])
mask = offs_m[:, None] >= offs_n[None, :] + 1
acc = tl.where(mask, acc, 0.0)
acc = tl.cumsum(acc, axis=1)
acc = tl.where(mask, acc, 0.0)
ddA_cs = tl.sum(acc, axis=0)
ddAcs_ptr += pid_b * stride_ddAcs_batch + pid_c * stride_ddAcs_chunk + pid_h * stride_ddAcs_head + pid_m * stride_ddAcs_csize_m
offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
ddAcs_ptrs = ddAcs_ptr + offs_n * stride_ddAcs_csize_n
tl.store(ddAcs_ptrs + stride_ddAcs_csize_n, ddA_cs, mask=offs_n < chunk_size - 1)
tl.store(ddAcs_ptr, 0.0)
# offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, 64)
# offs_k = tl.arange(0, BLOCK_SIZE_K)
# dout_ptrs = dout_ptr + (offs_m[:, None] * stride_dout_seqlen + offs_k[None, :] * stride_dout_hdim)
# x_ptrs = x_ptr + (offs_n[None, :] * stride_x_seqlen + offs_k[:, None] * stride_x_hdim)
# dt_ptrs = dt_ptr + offs_n * stride_dt_csize
# cb_ptrs = cb_ptr + (offs_m[:, None] * stride_cb_csize_m + offs_n[None, :] * stride_cb_csize_n)
# chunk_size_limit = min(chunk_size, seqlen - pid_c * chunk_size)
# chunk_size_limit_n = min(chunk_size_limit, (pid_m + 1) * BLOCK_SIZE_M)
# rowsum = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
# dout = tl.load(dout_ptrs, mask=(offs_m[:, None] < chunk_size_limit) & (offs_k[None, :] < hdim), other=0.0)
# dA_cs_m = tl.load(dA_cumsum_ptr + offs_m * stride_dA_cs_csize, mask=offs_m < chunk_size, other=0.0).to(tl.float32)
# ddAcs_ptr += pid_b * stride_ddAcs_batch + pid_c * stride_ddAcs_chunk + pid_h * stride_ddAcs_head + pid_m * stride_ddAcs_csize_m
# ddAcs_ptrs = ddAcs_ptr + offs_n * stride_ddAcs_csize_n
# for n in range(0, chunk_size_limit_n, 64):
# x = tl.load(x_ptrs, mask=(offs_k[:, None] < hdim) & (offs_n[None, :] < chunk_size_limit_n - n), other=0.0)
# acc = tl.dot(dout, x)
# cb = tl.load(cb_ptrs, mask=(offs_m[:, None] < chunk_size) & (offs_n[None, :] < chunk_size - n), other=0.0).to(tl.float32)
# acc *= cb
# dt_n = tl.load(dt_ptrs, mask=offs_n < chunk_size - n, other=0.0).to(tl.float32)
# acc *= dt_n
# dA_cs_n = tl.load(dA_cumsum_ptr + offs_n * stride_dA_cs_csize, mask=offs_n < chunk_size - n, other=0.0).to(tl.float32)
# acc *= tl.exp(dA_cs_m[:, None] - dA_cs_n[None, :])
# mask = offs_m[:, None] >= offs_n[None, :] + 1 + n
# acc = tl.where(mask, acc, 0.0)
# acc = tl.cumsum(acc, axis=1)
# acc = tl.where(mask, acc, 0.0)
# ddA_cs = tl.sum(acc, axis=0)
# tl.store(ddAcs_ptrs, ddA_cs, mask=offs_n < chunk_size - 1 - n)
# # tl.store(ddAcs_ptr, 0.0)
@triton.autotune(
configs=[
triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 32}, num_stages=3, num_warps=4),
triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 64}, num_stages=3, num_warps=4),
triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 128}, num_stages=3, num_warps=4),
triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 32}, num_stages=3, num_warps=4),
triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64}, num_stages=3, num_warps=4),
triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128}, num_stages=3, num_warps=4),
triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 32}, num_stages=3, num_warps=4),
triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64}, num_stages=3, num_warps=4),
triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128}, num_stages=3, num_warps=4),
],
key=['chunk_size', 'hdim'],
)
@triton.jit
def _chunk_scan_bwd_ddAcs_stable_kernel(
# Pointers to matrices
x_ptr, dout_ptr, dt_ptr, dA_cumsum_ptr, cb_ptr,
ddA_cumsum_ptr,
# Matrix dimensions
chunk_size, hdim,
batch, seqlen, nheads_ngroups_ratio,
# Strides
stride_x_batch, stride_x_seqlen, stride_x_head, stride_x_hdim,
stride_dout_batch, stride_dout_seqlen, stride_dout_head, stride_dout_hdim,
stride_dt_batch, stride_dt_chunk, stride_dt_head, stride_dt_csize,
stride_dA_cs_batch, stride_dA_cs_chunk, stride_dA_cs_head, stride_dA_cs_csize,
stride_cb_batch, stride_cb_chunk, stride_cb_head, stride_cb_csize_m, stride_cb_csize_n,
stride_ddA_cs_batch, stride_ddA_cs_chunk, stride_ddA_cs_head, stride_ddA_cs_csize_m, stride_ddA_cs_csize_n,
# Meta-parameters
BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
):
pid_bc = tl.program_id(axis=1)
pid_c = pid_bc // batch
pid_b = pid_bc - pid_c * batch
pid_h = tl.program_id(axis=2)
pid_m = tl.program_id(axis=0)
x_ptr += pid_b * stride_x_batch + pid_c * chunk_size * stride_x_seqlen + pid_h * stride_x_head
dout_ptr += pid_b * stride_dout_batch + pid_c * chunk_size * stride_dout_seqlen + pid_h * stride_dout_head
dt_ptr += pid_b * stride_dt_batch + pid_c * stride_dt_chunk + pid_h * stride_dt_head
dA_cumsum_ptr += pid_b * stride_dA_cs_batch + pid_c * stride_dA_cs_chunk + pid_h * stride_dA_cs_head
cb_ptr += pid_b * stride_cb_batch + pid_c * stride_cb_chunk + (pid_h // nheads_ngroups_ratio) * stride_cb_head
ddA_cumsum_ptr += pid_b * stride_ddA_cs_batch + pid_c * stride_ddA_cs_chunk + pid_h * stride_ddA_cs_head + pid_m * stride_ddA_cs_csize_m
offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
offs_n = tl.arange(0, BLOCK_SIZE_N)
offs_k = tl.arange(0, BLOCK_SIZE_K)
dout_ptrs = dout_ptr + (offs_m[:, None] * stride_dout_seqlen + offs_k[None, :] * stride_dout_hdim)
x_ptrs = x_ptr + (offs_n[None, :] * stride_x_seqlen + offs_k[:, None] * stride_x_hdim)
dt_ptrs = dt_ptr + offs_n * stride_dt_csize
cb_ptrs = cb_ptr + (offs_m[:, None] * stride_cb_csize_m + offs_n[None, :] * stride_cb_csize_n)
ddAcs_ptrs = ddA_cumsum_ptr + offs_n * stride_ddA_cs_csize_n
tl.store(ddA_cumsum_ptr, 0.0)
chunk_size_limit = min(chunk_size, seqlen - pid_c * chunk_size)
rowsum = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
dout = tl.load(dout_ptrs, mask=(offs_m[:, None] < chunk_size_limit) & (offs_k[None, :] < hdim), other=0.0)
dA_cs_m = tl.load(dA_cumsum_ptr + offs_m * stride_dA_cs_csize, mask=offs_m < chunk_size, other=0.0).to(tl.float32)
    # The exact upper bound is (pid_m + 1) * BLOCK_SIZE_M - 1, but subtracting the 1 makes the kernel slower
lo, hi = 0, (pid_m + 1) * BLOCK_SIZE_M
# lo, hi = 0, chunk_size
for start_n in range(lo, hi, BLOCK_SIZE_N):
start_n = tl.multiple_of(start_n, BLOCK_SIZE_N)
# Doing a matmul loop with cumsum later on will cause Triton to crash
# Instead we do just one big matmul
# acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
# for k in range(0, hdim, BLOCK_SIZE_K):
# dout = tl.load(dout_ptrs, mask=(offs_m[:, None] < chunk_size_limit) & (offs_k[None, :] < hdim - k), other=0.0)
# x = tl.load(x_ptrs, mask=(offs_k[:, None] < hdim - k) & (offs_n[None, :] < chunk_size_limit), other=0.0)
# acc += tl.dot(dout, x)
# dout_ptrs += BLOCK_SIZE_K * stride_dout_hdim
# x_ptrs += BLOCK_SIZE_K * stride_x_hdim
# x = tl.load(x_ptrs, mask=(offs_k[:, None] < hdim) & (offs_n[None, :] < chunk_size_limit_n), other=0.0)
x = tl.load(x_ptrs, mask=(offs_k[:, None] < hdim) & (offs_n[None, :] < chunk_size_limit - start_n), other=0.0)
acc = tl.dot(dout, x)
dt_n = tl.load(dt_ptrs, mask=offs_n < chunk_size - start_n, other=0.0).to(tl.float32)
acc *= dt_n
        # If there's seq_idx, we have already zeroed out cb[i, j] for seq_idx[i] != seq_idx[j]
cb = tl.load(cb_ptrs, mask=(offs_m[:, None] < chunk_size) & (offs_n[None, :] < chunk_size - start_n), other=0.0).to(tl.float32)
acc *= cb
        dA_cs_n = tl.load(dA_cumsum_ptr + (start_n + offs_n) * stride_dA_cs_csize, mask=offs_n < chunk_size - start_n, other=0.0).to(tl.float32)
acc *= tl.exp(dA_cs_m[:, None] - dA_cs_n[None, :])
mask = offs_m[:, None] >= start_n + offs_n[None, :] + 1
acc = tl.where(mask, acc, 0.0)
rowsum_new = rowsum + tl.sum(acc, axis=1)
acc = rowsum[:, None] + tl.cumsum(acc, axis=1)
rowsum = rowsum_new
acc = tl.where(mask, acc, 0.0)
ddA_cs = tl.sum(acc, axis=0)
tl.store(ddAcs_ptrs + stride_ddA_cs_csize_n, ddA_cs, mask=offs_n < chunk_size - start_n - 1)
x_ptrs += BLOCK_SIZE_N * stride_x_seqlen
dt_ptrs += BLOCK_SIZE_N * stride_dt_csize
cb_ptrs += BLOCK_SIZE_N * stride_cb_csize_n
ddAcs_ptrs += BLOCK_SIZE_N * stride_ddA_cs_csize_n
# Need to zero out the rest, since we'll be summing the rows together
for start_n in range(hi, chunk_size, BLOCK_SIZE_N):
tl.store(ddAcs_ptrs + stride_ddA_cs_csize_n, tl.zeros((BLOCK_SIZE_N,), dtype=tl.float32), mask=offs_n < chunk_size - start_n - 1)
ddAcs_ptrs += BLOCK_SIZE_N * stride_ddA_cs_csize_n
@triton.autotune(
configs=[
triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 128}, num_stages=3, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 32}, num_stages=3, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64}, num_stages=3, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 32}, num_stages=3, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 64}, num_stages=3, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 32}, num_stages=3, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
],
key=['chunk_size', 'dstate', 'hdim'],
)
@triton.jit
def _chunk_scan_bwd_ddAcs_prev_kernel(
# Pointers to matrices
dout_ptr, prev_states_ptr, C_ptr, dA_cumsum_ptr, seq_idx_ptr,
ddA_cumsum_ptr,
# Matrix dimensions
chunk_size, dstate, hdim,
batch, seqlen, nchunks, nheads_ngroups_ratio,
# Strides
stride_dout_batch, stride_dout_seqlen, stride_dout_head, stride_dout_hdim,
stride_prev_states_batch, stride_prev_states_chunk, stride_prev_states_head, stride_prev_states_hdim, stride_prev_states_dstate,
stride_C_batch, stride_C_seqlen, stride_C_head, stride_C_dstate,
stride_dA_cs_batch, stride_dA_cs_chunk, stride_dA_cs_head, stride_dA_cs_csize,
stride_seq_idx_batch, stride_seq_idx_seqlen,
stride_ddA_cs_batch, stride_ddA_cs_chunk, stride_ddA_cs_head, stride_ddA_cs_csize,
# Meta-parameters
HAS_SEQ_IDX: tl.constexpr,
BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
):
pid_bc = tl.program_id(axis=1)
pid_c = pid_bc // batch
pid_b = pid_bc - pid_c * batch
pid_h = tl.program_id(axis=2)
num_pid_n = tl.cdiv(dstate, BLOCK_SIZE_N)
pid_m = tl.program_id(axis=0) // num_pid_n
pid_n = tl.program_id(axis=0) % num_pid_n
dout_ptr += pid_b * stride_dout_batch + pid_c * chunk_size * stride_dout_seqlen + pid_h * stride_dout_head
prev_states_ptr += pid_b * stride_prev_states_batch + pid_c * stride_prev_states_chunk + pid_h * stride_prev_states_head
C_ptr += pid_b * stride_C_batch + pid_c * chunk_size * stride_C_seqlen + (pid_h // nheads_ngroups_ratio) * stride_C_head
ddA_cumsum_ptr += pid_b * stride_ddA_cs_batch + pid_c * stride_ddA_cs_chunk + pid_h * stride_ddA_cs_head
dA_cumsum_ptr += pid_b * stride_dA_cs_batch + pid_c * stride_dA_cs_chunk + pid_h * stride_dA_cs_head
if HAS_SEQ_IDX:
seq_idx_ptr += pid_b * stride_seq_idx_batch + pid_c * chunk_size * stride_seq_idx_seqlen
offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
offs_k = tl.arange(0, BLOCK_SIZE_K)
dout_ptrs = dout_ptr + (offs_m[:, None] * stride_dout_seqlen + offs_k[None, :] * stride_dout_hdim)
prev_states_ptrs = prev_states_ptr + (offs_n[None, :] * stride_prev_states_dstate + offs_k[:, None] * stride_prev_states_hdim)
C_ptrs = C_ptr + (offs_m[:, None] * stride_C_seqlen + offs_n[None, :] * stride_C_dstate)
dA_cumsum_ptrs = dA_cumsum_ptr + offs_m * stride_dA_cs_csize
chunk_size_limit = min(chunk_size, seqlen - pid_c * chunk_size)
dout = tl.load(dout_ptrs, mask=(offs_m[:, None] < chunk_size_limit) & (offs_k[None, :] < hdim), other=0.0)
prev_states = tl.load(prev_states_ptrs, mask=(offs_k[:, None] < hdim) & (offs_n[None, :] < dstate), other=0.0)
prev_states = prev_states.to(dout_ptrs.dtype.element_ty)
acc = tl.dot(dout, prev_states)
c = tl.load(C_ptrs, mask=(offs_m[:, None] < chunk_size_limit) & (offs_n[None, :] < dstate), other=0.0).to(tl.float32)
ddA_cs = tl.sum(acc * c, axis=1)
dA_cs_m = tl.load(dA_cumsum_ptrs, mask=offs_m < chunk_size_limit, other=0.0).to(tl.float32)
    if not HAS_SEQ_IDX:
        scale = tl.exp(dA_cs_m)
    else:
        seq_idx_prev = tl.load(seq_idx_ptr - stride_seq_idx_seqlen, mask=pid_c >= 1, other=0)
        seq_idx_m = tl.load(seq_idx_ptr + offs_m * stride_seq_idx_seqlen, mask=offs_m < chunk_size_limit, other=-1)
        scale = tl.where(seq_idx_m == seq_idx_prev, tl.exp(dA_cs_m), 0.0)
ddA_cs *= scale
offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
ddA_cumsum_ptrs = ddA_cumsum_ptr + offs_m * stride_ddA_cs_csize
tl.atomic_add(ddA_cumsum_ptrs, ddA_cs, mask=offs_m < chunk_size)
def _chunk_scan_fwd(cb, x, dt, dA_cumsum, C, states, D=None, z=None, seq_idx=None):
batch, seqlen, nheads, headdim = x.shape
_, _, nchunks, chunk_size = dt.shape
_, _, ngroups, dstate = C.shape
assert nheads % ngroups == 0
assert C.shape == (batch, seqlen, ngroups, dstate)
assert cb.shape == (batch, nchunks, ngroups, chunk_size, chunk_size)
if z is not None:
assert z.shape == x.shape
if D is not None:
assert D.shape == (nheads, headdim) or D.shape == (nheads,)
assert dt.shape == (batch, nheads, nchunks, chunk_size)
assert dA_cumsum.shape == (batch, nheads, nchunks, chunk_size)
assert states.shape == (batch, nchunks, nheads, headdim, dstate)
if seq_idx is not None:
assert seq_idx.shape == (batch, seqlen)
# Allocates output.
out = torch.empty(batch, seqlen, nheads, headdim, device=x.device, dtype=x.dtype)
if z is not None:
out_x = torch.empty(batch, seqlen, nheads, headdim, device=x.device, dtype=x.dtype)
assert out_x.stride() == out.stride()
else:
out_x = None
grid = lambda META: (triton.cdiv(chunk_size, META['BLOCK_SIZE_M']) * triton.cdiv(headdim, META['BLOCK_SIZE_N']),
batch * nchunks, nheads)
z_strides = ((z.stride(0), z.stride(1), z.stride(2), z.stride(3))
if z is not None else (0, 0, 0, 0))
_chunk_scan_fwd_kernel[grid](
cb, x, z, out, out_x, dt, dA_cumsum, seq_idx, C, states, D,
chunk_size, headdim, dstate,
batch, seqlen, nheads // ngroups,
cb.stride(0), cb.stride(1), cb.stride(2), cb.stride(3), cb.stride(4),
x.stride(0), x.stride(1), x.stride(2), x.stride(3),
z_strides[0], z_strides[1], z_strides[2], z_strides[3],
out.stride(0), out.stride(1), out.stride(2), out.stride(3),
dt.stride(0), dt.stride(2), dt.stride(1), dt.stride(3),
dA_cumsum.stride(0), dA_cumsum.stride(2), dA_cumsum.stride(1), dA_cumsum.stride(3),
*((seq_idx.stride(0), seq_idx.stride(1)) if seq_idx is not None else (0, 0)),
C.stride(0), C.stride(1), C.stride(2), C.stride(3),
states.stride(0), states.stride(1), states.stride(2), states.stride(3), states.stride(4),
D.stride(0) if D is not None else 0,
        True,  # IS_CAUSAL
        D is not None,  # HAS_D
        D.dim() == 2 if D is not None else True,  # D_HAS_HDIM
BLOCK_SIZE_DSTATE=max(triton.next_power_of_2(dstate), 16),
HAS_Z=z is not None,
HAS_SEQ_IDX=seq_idx is not None,
IS_TRITON_22=TRITON_22,
)
return out, out_x
def _chunk_scan_fwd_wip(cb, x, dt, dA_cumsum, C, B, states, D=None, z=None, seq_idx=None):
batch, seqlen, nheads, headdim = x.shape
_, _, nchunks, chunk_size = dt.shape
_, _, ngroups, dstate = C.shape
assert nheads % ngroups == 0
assert C.shape == (batch, seqlen, ngroups, dstate)
assert B.shape == C.shape
assert cb.shape == (batch, nchunks, ngroups, chunk_size, chunk_size)
if z is not None:
assert z.shape == x.shape
if D is not None:
assert D.shape == (nheads, headdim) or D.shape == (nheads,)
assert dt.shape == (batch, nheads, nchunks, chunk_size)
assert dA_cumsum.shape == (batch, nheads, nchunks, chunk_size)
assert states.shape == (batch, nchunks, nheads, headdim, dstate)
if seq_idx is not None:
assert seq_idx.shape == (batch, seqlen)
# Allocates output.
out = torch.empty(batch, seqlen, nheads, headdim, device=x.device, dtype=x.dtype)
if z is not None:
out_x = torch.empty(batch, seqlen, nheads, headdim, device=x.device, dtype=x.dtype)
assert out_x.stride() == out.stride()
else:
out_x = None
grid = lambda META: (triton.cdiv(headdim, META['BLOCK_SIZE_N']), batch * nchunks, nheads)
z_strides = ((z.stride(0), z.stride(1), z.stride(2), z.stride(3))
if z is not None else (0, 0, 0, 0))
_chunk_scan_fwd_kernel_wip[grid](
cb, x, z, out, out_x, dt, dA_cumsum, seq_idx, C, B, states, D,
chunk_size, headdim, dstate,
batch, seqlen, nheads // ngroups,
cb.stride(0), cb.stride(1), cb.stride(2), cb.stride(3), cb.stride(4),
x.stride(0), x.stride(1), x.stride(2), x.stride(3),
z_strides[0], z_strides[1], z_strides[2], z_strides[3],
out.stride(0), out.stride(1), out.stride(2), out.stride(3),
dt.stride(0), dt.stride(2), dt.stride(1), dt.stride(3),
dA_cumsum.stride(0), dA_cumsum.stride(2), dA_cumsum.stride(1), dA_cumsum.stride(3),
*((seq_idx.stride(0), seq_idx.stride(1)) if seq_idx is not None else (0, 0)),
C.stride(0), C.stride(1), C.stride(2), C.stride(3),
B.stride(0), B.stride(1), B.stride(2), B.stride(3),
states.stride(0), states.stride(1), states.stride(2), states.stride(3), states.stride(4),
D.stride(0) if D is not None else 0,
D is not None,
D.dim() == 2 if D is not None else True,
BLOCK_SIZE_DSTATE=max(triton.next_power_of_2(dstate), 16),
BLOCK_SIZE_M=128,
HAS_Z=z is not None,
HAS_SEQ_IDX=seq_idx is not None,
)
return out, out_x
def _chunk_scan_bwd_dz(x, z, out, dout, chunk_size, has_ddAcs=True, D=None, dz=None, recompute_output=False):
batch, seqlen, nheads, headdim = x.shape
assert z.shape == x.shape
assert out.shape == x.shape
assert dout.shape == out.shape
nchunks = math.ceil(seqlen / chunk_size)
if D is not None:
assert D.shape == (nheads, headdim) or D.shape == (nheads,)
assert D.stride(-1) == 1
if has_ddAcs:
ddA_cumsum = torch.empty(batch, nheads, nchunks, chunk_size, device=x.device, dtype=torch.float32)
if D is not None:
BLOCK_SIZE_min = 32
dD = torch.empty(triton.cdiv(chunk_size, BLOCK_SIZE_min), batch, nchunks, nheads,
headdim if D.dim() == 2 else 1, device=D.device, dtype=torch.float32)
else:
dD = None
if dz is not None:
assert dz.shape == z.shape
else:
dz = torch.empty_like(z)
if recompute_output:
outz = torch.empty_like(x)
dout_x = torch.empty_like(dout)
dD_strides = ((dD.stride(0), dD.stride(1), dD.stride(2), dD.stride(3), dD.stride(4))
if D is not None else (0, 0, 0, 0, 0))
grid_dz = lambda META: (triton.cdiv(chunk_size, META['BLOCK_SIZE_M']), batch * nchunks, nheads)
with torch.cuda.device(x.device.index):
_chunk_scan_bwd_dz_kernel[grid_dz](
dout, out, z, x, D, outz if recompute_output else None,
dz, dout_x, dD, ddA_cumsum if has_ddAcs else None,
chunk_size, headdim,
batch, seqlen,
dout.stride(0), dout.stride(1), dout.stride(2), dout.stride(3),
out.stride(0), out.stride(1), out.stride(2), out.stride(3),
z.stride(0), z.stride(1), z.stride(2), z.stride(3),
x.stride(0), x.stride(1), x.stride(2), x.stride(3),
D.stride(0) if D is not None else 0,
*((outz.stride(0), outz.stride(1), outz.stride(2), outz.stride(3)) if recompute_output else (0, 0, 0, 0)),
dz.stride(0), dz.stride(1), dz.stride(2), dz.stride(3),
dout_x.stride(0), dout_x.stride(1), dout_x.stride(2), dout_x.stride(3),
dD_strides[1], dD_strides[2], dD_strides[3], dD_strides[0], dD_strides[4],
*((ddA_cumsum.stride(0), ddA_cumsum.stride(2), ddA_cumsum.stride(1), ddA_cumsum.stride(3))
if has_ddAcs else (0, 0, 0, 0)),
D is not None,
D.dim() == 2 if D is not None else True,
has_ddAcs,
BLOCK_SIZE_N=max(triton.next_power_of_2(headdim), 16),
RECOMPUTE_OUTPUT=recompute_output,
)
if D is not None:
BLOCK_SIZE_actual = _chunk_scan_bwd_dz_kernel.best_config.kwargs["BLOCK_SIZE_M"]
n_valid_blocks = (chunk_size + BLOCK_SIZE_actual - 1) // BLOCK_SIZE_actual
dD = dD[:n_valid_blocks].sum(dim=(0, 1, 2)).to(dtype=D.dtype)
if D.dim() == 1:
dD = rearrange(dD, "h 1 -> h")
return_vals = (dz, dout_x, dD, ddA_cumsum) if has_ddAcs else (dz, dout_x, dD)
return return_vals if not recompute_output else (*return_vals, outz)
def _chunk_scan_bwd_dstates(C, dA_cumsum, dout, seq_idx=None, dtype=None):
batch, seqlen, nheads, headdim = dout.shape
_, _, nchunks, chunk_size = dA_cumsum.shape
_, _, ngroups, dstate = C.shape
assert nheads % ngroups == 0
assert C.shape == (batch, seqlen, ngroups, dstate)
assert dA_cumsum.shape == (batch, nheads, nchunks, chunk_size)
if seq_idx is not None:
assert seq_idx.shape == (batch, seqlen)
dtype = C.dtype if dtype is None else dtype
dprev_states = torch.empty(batch, nchunks, nheads, headdim, dstate, device=C.device, dtype=dtype)
grid_dstates = lambda META: (triton.cdiv(headdim, META['BLOCK_SIZE_M']) * triton.cdiv(dstate, META['BLOCK_SIZE_N']),
batch * nchunks, nheads)
with torch.cuda.device(C.device.index):
_chunk_scan_bwd_dstates_kernel[grid_dstates](
dout, C, dprev_states, dA_cumsum, seq_idx,
headdim, dstate, chunk_size,
batch, seqlen, nchunks, nheads // ngroups,
dout.stride(0), dout.stride(1), dout.stride(2), dout.stride(3),
C.stride(0), C.stride(1), C.stride(2), C.stride(3),
dprev_states.stride(0), dprev_states.stride(1), dprev_states.stride(2), dprev_states.stride(3), dprev_states.stride(4),
dA_cumsum.stride(0), dA_cumsum.stride(2), dA_cumsum.stride(1), dA_cumsum.stride(3),
*((seq_idx.stride(0), seq_idx.stride(1)) if seq_idx is not None else (0, 0)),
HAS_SEQ_IDX=seq_idx is not None,
)
return dprev_states
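# A minimal eager sketch, under the assumption that seq_idx is None and seqlen == nchunks * chunk_size,
# of the quantity _chunk_scan_bwd_dstates computes: the gradient of
# out_prev = (C @ prev_states) * exp(dA_cumsum) with respect to prev_states (see chunk_scan_ref below).
# The name _chunk_scan_bwd_dstates_ref is illustrative and not part of the original API.
def _chunk_scan_bwd_dstates_ref(C, dA_cumsum, dout):
    batch, seqlen, nheads, headdim = dout.shape
    _, _, nchunks, chunk_size = dA_cumsum.shape
    ngroups = C.shape[2]
    C = repeat(C, "b l g n -> b l (g h) n", h=nheads // ngroups)
    C = rearrange(C, "b (c l) h n -> b c l h n", c=nchunks)
    dout = rearrange(dout, "b (c l) h p -> b c l h p", c=nchunks)
    decay = torch.exp(dA_cumsum.float())  # (batch, nheads, nchunks, chunk_size)
    dprev_states = torch.einsum("bclhn,bhcl,bclhp->bchpn", C.float(), decay, dout.float())
    return dprev_states  # (batch, nchunks, nheads, headdim, dstate)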
def _chunk_scan_bwd_dC(prev_states, dA_cumsum, dout, seq_idx=None, C=None, ngroups=1):
batch, nchunks, nheads, headdim, dstate = prev_states.shape
_, seqlen, _, _ = dout.shape
_, _, _, chunk_size = dA_cumsum.shape
assert prev_states.shape == (batch, nchunks, nheads, headdim, dstate)
assert dA_cumsum.shape == (batch, nheads, nchunks, chunk_size)
assert dout.shape == (batch, seqlen, nheads, headdim)
if seq_idx is not None:
assert seq_idx.shape == (batch, seqlen)
if C is not None:
assert C.shape == (batch, seqlen, ngroups, dstate)
C_strides = (C.stride(0), C.stride(1), C.stride(2), C.stride(3))
ddA_cumsum_prev = torch.empty(batch, nheads, nchunks, chunk_size, device=dout.device, dtype=torch.float32)
ddA_cumsum_prev_strides = (ddA_cumsum_prev.stride(0), ddA_cumsum_prev.stride(2), ddA_cumsum_prev.stride(1), ddA_cumsum_prev.stride(3))
else:
C_strides = (0, 0, 0, 0)
ddA_cumsum_prev = None
ddA_cumsum_prev_strides = (0, 0, 0, 0)
nheads_ngroups_ratio = nheads // ngroups
sm_count = torch.cuda.get_device_properties(dout.device).multi_processor_count
nheads_per_program = max(min(math.ceil(batch * nchunks * nheads / sm_count), nheads_ngroups_ratio), 1)
nsplits = triton.cdiv(nheads_ngroups_ratio, nheads_per_program)
dC = torch.empty(batch, seqlen, nsplits, ngroups, dstate, device=dout.device, dtype=torch.float32)
grid_dc = lambda META: (triton.cdiv(chunk_size, META['BLOCK_SIZE_M']) * triton.cdiv(dstate, META['BLOCK_SIZE_N']),
batch * nchunks, nsplits * ngroups)
with torch.cuda.device(dout.device.index):
_chunk_scan_bwd_dc_kernel[grid_dc](
dout, prev_states, C, dA_cumsum, seq_idx, dC, ddA_cumsum_prev,
chunk_size, dstate, headdim,
batch, seqlen, nheads, nheads_per_program, ngroups,
dout.stride(0), dout.stride(1), dout.stride(2), dout.stride(3),
prev_states.stride(0), prev_states.stride(1), prev_states.stride(2), prev_states.stride(3), prev_states.stride(4),
*C_strides,
dA_cumsum.stride(0), dA_cumsum.stride(2), dA_cumsum.stride(1), dA_cumsum.stride(3),
*((seq_idx.stride(0), seq_idx.stride(1)) if seq_idx is not None else (0, 0)),
dC.stride(0), dC.stride(1), dC.stride(2), dC.stride(3), dC.stride(4),
*ddA_cumsum_prev_strides,
HAS_DDA_CS=ddA_cumsum_prev is not None,
HAS_SEQ_IDX=seq_idx is not None,
BLOCK_SIZE_K=max(triton.next_power_of_2(headdim), 16),
)
dC = dC.sum(2)
return dC if C is None else (dC, ddA_cumsum_prev)
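# A minimal eager sketch, ignoring seq_idx, of the dC term that _chunk_scan_bwd_dC above computes:
# only the contribution of out_prev = (C @ prev_states) * exp(dA_cumsum); the dCB contribution is
# added separately in ChunkScanFn.backward. The name _chunk_scan_bwd_dC_ref is illustrative only.
def _chunk_scan_bwd_dC_ref(prev_states, dA_cumsum, dout, ngroups=1):
    batch, nchunks, nheads, headdim, dstate = prev_states.shape
    dout = rearrange(dout, "b (c l) h p -> b c l h p", c=nchunks)
    dC = torch.einsum("bclhp,bchpn,bhcl->bclhn",
                      dout.float(), prev_states.float(), torch.exp(dA_cumsum.float()))
    # Sum the per-head gradients within each group
    dC = rearrange(dC, "b c l (g r) n -> b (c l) g r n", g=ngroups).sum(dim=3)
    return dC  # (batch, seqlen, ngroups, dstate)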
def _chunk_scan_bwd_dcb(x, dt, dA_cumsum, dout, seq_idx=None, CB=None, ngroups=1):
batch, seqlen, nheads, headdim = x.shape
_, _, nchunks, chunk_size = dt.shape
assert dt.shape == (batch, nheads, nchunks, chunk_size)
assert dA_cumsum.shape == dt.shape
assert dout.shape == x.shape
if seq_idx is not None:
assert seq_idx.shape == (batch, seqlen)
if CB is not None:
assert CB.shape == (batch, nchunks, ngroups, chunk_size, chunk_size)
CB_strides = (CB.stride(0), CB.stride(1), CB.stride(2), CB.stride(3), CB.stride(4))
BLOCK_SIZE_M_min = 16
ddA_cumsum = torch.empty(batch, nheads, nchunks, triton.cdiv(chunk_size, BLOCK_SIZE_M_min),
chunk_size, device=x.device, dtype=torch.float32)
ddA_cumsum_strides = (ddA_cumsum.stride(0), ddA_cumsum.stride(2), ddA_cumsum.stride(1), ddA_cumsum.stride(3), ddA_cumsum.stride(4))
else:
CB_strides = (0, 0, 0, 0, 0)
ddA_cumsum = None
ddA_cumsum_strides = (0, 0, 0, 0, 0)
nheads_ngroups_ratio = nheads // ngroups
sm_count = torch.cuda.get_device_properties(x.device).multi_processor_count
nheads_per_program = max(min(math.ceil(batch * nchunks * nheads / sm_count), nheads_ngroups_ratio), 1)
nsplits = triton.cdiv(nheads_ngroups_ratio, nheads_per_program)
dcb = torch.empty(batch, nchunks, nsplits, ngroups, chunk_size, chunk_size, device=x.device, dtype=torch.float32)
grid_dcb = lambda META: (triton.cdiv(chunk_size, META['BLOCK_SIZE_M']) * triton.cdiv(chunk_size, META['BLOCK_SIZE_N']),
batch * nchunks, nsplits * ngroups)
with torch.cuda.device(x.device.index):
_chunk_scan_bwd_dcb_kernel[grid_dcb](
x, dout, CB, dt, dA_cumsum, seq_idx, dcb, ddA_cumsum,
chunk_size, headdim,
batch, seqlen, nheads, nheads_per_program, ngroups,
x.stride(0), x.stride(1), x.stride(2), x.stride(3),
dout.stride(0), dout.stride(1), dout.stride(2), dout.stride(3),
*CB_strides,
dt.stride(0), dt.stride(2), dt.stride(1), dt.stride(3),
dA_cumsum.stride(0), dA_cumsum.stride(2), dA_cumsum.stride(1), dA_cumsum.stride(3),
*((seq_idx.stride(0), seq_idx.stride(1)) if seq_idx is not None else (0, 0)),
dcb.stride(0), dcb.stride(1), dcb.stride(2), dcb.stride(3), dcb.stride(4), dcb.stride(5),
*ddA_cumsum_strides,
HAS_DDA_CS=ddA_cumsum is not None,
HAS_SEQ_IDX=seq_idx is not None,
BLOCK_SIZE_K=max(triton.next_power_of_2(headdim), 16),
)
dcb = dcb.sum(2)
if ddA_cumsum is not None:
BLOCK_SIZE_M_actual = _chunk_scan_bwd_dcb_kernel.best_config.kwargs["BLOCK_SIZE_M"]
n_valid_blocks = (chunk_size + BLOCK_SIZE_M_actual - 1) // BLOCK_SIZE_M_actual
ddA_cumsum = ddA_cumsum[:, :, :, :n_valid_blocks].sum(dim=3)
return dcb if CB is None else (dcb, ddA_cumsum)
def _chunk_scan_bwd_dx(cb, x, dt, dA_cumsum, dout, D=None):
batch, seqlen, nheads, headdim = x.shape
_, _, nchunks, chunk_size = dt.shape
ngroups = cb.shape[2]
assert nheads % ngroups == 0
assert cb.shape == (batch, nchunks, ngroups, chunk_size, chunk_size)
assert dt.shape == (batch, nheads, nchunks, chunk_size)
assert dA_cumsum.shape == dt.shape
assert dout.shape == x.shape
# if D is not None:
# BLOCK_SIZE_M_min = 32
# dD = torch.empty(triton.cdiv(chunk_size, BLOCK_SIZE_M_min), batch, nchunks, nheads, headdim, device=D.device, dtype=torch.float32)
# else:
# dD = None
dx = torch.empty_like(x)
ddt = torch.empty(batch, nheads, nchunks, chunk_size, device=dout.device, dtype=torch.float32)
grid_dx = lambda META: (triton.cdiv(chunk_size, META['BLOCK_SIZE_M']) * triton.cdiv(headdim, META['BLOCK_SIZE_N']),
batch * nchunks, nheads)
with torch.cuda.device(x.device.index):
_chunk_scan_bwd_dx_kernel[grid_dx](
x, cb, dout, dt, dA_cumsum, D, dx, ddt, # dD,
chunk_size, headdim,
batch, seqlen, nheads // ngroups,
x.stride(0), x.stride(1), x.stride(2), x.stride(3),
cb.stride(0), cb.stride(1), cb.stride(2), cb.stride(-1), cb.stride(-2),
dout.stride(0), dout.stride(1), dout.stride(2), dout.stride(3),
dt.stride(0), dt.stride(2), dt.stride(1), dt.stride(3),
dA_cumsum.stride(0), dA_cumsum.stride(2), dA_cumsum.stride(1), dA_cumsum.stride(3),
D.stride(0) if D is not None else 0,
dx.stride(0), dx.stride(1), dx.stride(2), dx.stride(3),
ddt.stride(0), ddt.stride(2), ddt.stride(1), ddt.stride(3),
# dD.stride(1) if dD is not None else 0, dD.stride(2) if dD is not None else 0, dD.stride(3) if dD is not None else 0, dD.stride(4) if dD is not None else 0, dD.stride(0) if dD is not None else 0,
D is not None,
D.dim() == 2 if D is not None else True,
)
# if D is not None:
# BLOCK_SIZE_actual = _chunk_scan_bwd_dx_kernel.best_config.kwargs["BLOCK_SIZE_M"]
# n_valid_blocks = (chunk_size + BLOCK_SIZE_actual - 1) // BLOCK_SIZE_actual
# dD = dD[:n_valid_blocks].sum(dim=(0, 1, 2)).to(dtype=D.dtype)
return dx, ddt.to(dtype=dt.dtype)
def _chunk_scan_bwd_ddAcs_unstable(x, dt, out, dout, ddt, D=None, subtract_ddtdt=True):
"""Not numerically stable and should not be used. Leaving here for reference.
"""
batch, seqlen, nheads, headdim = x.shape
_, _, nchunks, chunk_size = dt.shape
assert dt.shape == (batch, nheads, nchunks, chunk_size)
assert ddt.shape == dt.shape
assert out.shape == x.shape
assert dout.shape == x.shape
if D is not None:
assert D.shape == (nheads, headdim) or D.shape == (nheads,)
ddA_cumsum = torch.empty_like(dt)
grid_ddtcs = lambda META: (triton.cdiv(chunk_size, META['BLOCK_SIZE_M']), batch * nchunks, nheads)
if D is not None: # Triton gives wrong results if we write to the same location
BLOCK_SIZE_min = 32
dD = torch.empty(triton.cdiv(chunk_size, BLOCK_SIZE_min), batch, nchunks, nheads,
headdim if D.dim() == 2 else 1, device=D.device, dtype=torch.float32)
else:
dD = None
dD_strides = ((dD.stride(0), dD.stride(1), dD.stride(2), dD.stride(3), dD.stride(4))
if D is not None else (0, 0, 0, 0, 0))
with torch.cuda.device(x.device.index):
_chunk_scan_bwd_ddAcs_unstable_kernel[grid_ddtcs](
dout, out, dt, ddt, x, D, ddA_cumsum, dD,
chunk_size, headdim,
batch, seqlen,
dout.stride(0), dout.stride(1), dout.stride(2), dout.stride(3),
out.stride(0), out.stride(1), out.stride(2), out.stride(3),
dt.stride(0), dt.stride(2), dt.stride(1), dt.stride(3),
ddt.stride(0), ddt.stride(2), ddt.stride(1), ddt.stride(3),
x.stride(0), x.stride(1), x.stride(2), x.stride(3),
D.stride(0) if D is not None else 0,
ddA_cumsum.stride(0), ddA_cumsum.stride(2), ddA_cumsum.stride(1), ddA_cumsum.stride(3),
dD_strides[1], dD_strides[2], dD_strides[3], dD_strides[0], dD_strides[4],
D is not None,
D.dim() == 2 if D is not None else True,
subtract_ddtdt,
BLOCK_SIZE_N=max(triton.next_power_of_2(headdim), 16),
)
if D is not None:
BLOCK_SIZE_actual = _chunk_scan_bwd_ddAcs_unstable_kernel.best_config.kwargs["BLOCK_SIZE_M"]
n_valid_blocks = (chunk_size + BLOCK_SIZE_actual - 1) // BLOCK_SIZE_actual
dD = dD[:n_valid_blocks].sum(dim=(0, 1, 2)).to(dtype=D.dtype)
if D.dim() == 1:
dD = rearrange(dD, "h 1 -> h")
return ddA_cumsum, dD
def _chunk_scan_bwd_ddAcs_stable_old(x, dt, dA_cumsum, dout, cb):
batch, seqlen, nheads, headdim = x.shape
_, _, nchunks, chunk_size = dt.shape
assert dt.shape == (batch, nheads, nchunks, chunk_size)
assert dout.shape == x.shape
assert dA_cumsum.shape == dt.shape
ngroups = cb.shape[2]
assert nheads % ngroups == 0
assert cb.shape == (batch, nchunks, ngroups, chunk_size, chunk_size)
BLOCK_SIZE_M_min = 16
ddA_cumsum = torch.empty(batch, nheads, nchunks, triton.cdiv(chunk_size, BLOCK_SIZE_M_min),
chunk_size, device=x.device, dtype=torch.float32)
grid_ddtcs = lambda META: (triton.cdiv(chunk_size, META['BLOCK_SIZE_M']), batch * nchunks, nheads)
with torch.cuda.device(x.device.index):
_chunk_scan_bwd_ddAcs_stable_kernel_old[grid_ddtcs](
x, dout, dt, dA_cumsum, cb, ddA_cumsum,
chunk_size, headdim,
batch, seqlen, nheads // ngroups,
x.stride(0), x.stride(1), x.stride(2), x.stride(3),
dout.stride(0), dout.stride(1), dout.stride(2), dout.stride(3),
dt.stride(0), dt.stride(2), dt.stride(1), dt.stride(3),
dA_cumsum.stride(0), dA_cumsum.stride(2), dA_cumsum.stride(1), dA_cumsum.stride(3),
cb.stride(0), cb.stride(1), cb.stride(2), cb.stride(3), cb.stride(4),
ddA_cumsum.stride(0), ddA_cumsum.stride(2), ddA_cumsum.stride(1), ddA_cumsum.stride(3), ddA_cumsum.stride(4),
BLOCK_SIZE_K=max(triton.next_power_of_2(headdim), 16),
BLOCK_SIZE_N=max(triton.next_power_of_2(chunk_size), 16),
)
BLOCK_SIZE_M_actual = _chunk_scan_bwd_ddAcs_stable_kernel_old.best_config.kwargs["BLOCK_SIZE_M"]
n_valid_blocks = (chunk_size + BLOCK_SIZE_M_actual - 1) // BLOCK_SIZE_M_actual
ddA_cumsum = ddA_cumsum[:, :, :, :n_valid_blocks].sum(dim=3)
return ddA_cumsum
def _chunk_scan_bwd_ddAcs_stable(x, dt, dA_cumsum, dout, cb):
batch, seqlen, nheads, headdim = x.shape
_, _, nchunks, chunk_size = dt.shape
assert dt.shape == (batch, nheads, nchunks, chunk_size)
assert dout.shape == x.shape
assert dA_cumsum.shape == dt.shape
ngroups = cb.shape[2]
assert nheads % ngroups == 0
assert cb.shape == (batch, nchunks, ngroups, chunk_size, chunk_size)
BLOCK_SIZE_M_min = 32
ddA_cumsum = torch.empty(batch, nheads, nchunks, triton.cdiv(chunk_size, BLOCK_SIZE_M_min),
chunk_size, device=x.device, dtype=torch.float32)
grid_ddtcs = lambda META: (triton.cdiv(chunk_size, META['BLOCK_SIZE_M']), batch * nchunks, nheads)
with torch.cuda.device(x.device.index):
_chunk_scan_bwd_ddAcs_stable_kernel[grid_ddtcs](
x, dout, dt, dA_cumsum, cb, ddA_cumsum,
chunk_size, headdim,
batch, seqlen, nheads // ngroups,
x.stride(0), x.stride(1), x.stride(2), x.stride(3),
dout.stride(0), dout.stride(1), dout.stride(2), dout.stride(3),
dt.stride(0), dt.stride(2), dt.stride(1), dt.stride(3),
dA_cumsum.stride(0), dA_cumsum.stride(2), dA_cumsum.stride(1), dA_cumsum.stride(3),
cb.stride(0), cb.stride(1), cb.stride(2), cb.stride(3), cb.stride(4),
ddA_cumsum.stride(0), ddA_cumsum.stride(2), ddA_cumsum.stride(1), ddA_cumsum.stride(3), ddA_cumsum.stride(4),
BLOCK_SIZE_K=max(triton.next_power_of_2(headdim), 16),
)
BLOCK_SIZE_M_actual = _chunk_scan_bwd_ddAcs_stable_kernel.best_config.kwargs["BLOCK_SIZE_M"]
n_valid_blocks = (chunk_size + BLOCK_SIZE_M_actual - 1) // BLOCK_SIZE_M_actual
ddA_cumsum = ddA_cumsum[:, :, :, :n_valid_blocks].sum(dim=3)
return ddA_cumsum
def _chunk_scan_bwd_ddAcs_prev(prev_states, C, dout, dA_cumsum, seq_idx=None):
batch, nchunks, nheads, headdim, dstate = prev_states.shape
_, seqlen, _, _ = dout.shape
_, _, _, chunk_size = dA_cumsum.shape
assert prev_states.shape == (batch, nchunks, nheads, headdim, dstate)
assert dA_cumsum.shape == (batch, nheads, nchunks, chunk_size)
assert dout.shape == (batch, seqlen, nheads, headdim)
ngroups = C.shape[2]
assert nheads % ngroups == 0
assert C.shape == (batch, seqlen, ngroups, dstate)
if seq_idx is not None:
assert seq_idx.shape == (batch, seqlen)
ddA_cumsum_prev = torch.empty(batch, nheads, nchunks, chunk_size, device=dout.device, dtype=torch.float32)
grid_ddAcs = lambda META: (triton.cdiv(chunk_size, META['BLOCK_SIZE_M']) * triton.cdiv(dstate, META['BLOCK_SIZE_N']),
batch * nchunks, nheads)
with torch.cuda.device(dout.device.index):
_chunk_scan_bwd_ddAcs_prev_kernel[grid_ddAcs](
dout, prev_states, C, dA_cumsum, seq_idx, ddA_cumsum_prev,
chunk_size, dstate, headdim,
batch, seqlen, nchunks, nheads // ngroups,
dout.stride(0), dout.stride(1), dout.stride(2), dout.stride(3),
prev_states.stride(0), prev_states.stride(1), prev_states.stride(2), prev_states.stride(3), prev_states.stride(4),
C.stride(0), C.stride(1), C.stride(2), C.stride(3),
dA_cumsum.stride(0), dA_cumsum.stride(2), dA_cumsum.stride(1), dA_cumsum.stride(3),
*((seq_idx.stride(0), seq_idx.stride(1)) if seq_idx is not None else (0, 0)),
ddA_cumsum_prev.stride(0), ddA_cumsum_prev.stride(2), ddA_cumsum_prev.stride(1), ddA_cumsum_prev.stride(3),
HAS_SEQ_IDX=seq_idx is not None,
BLOCK_SIZE_K=max(triton.next_power_of_2(headdim), 16),
)
return ddA_cumsum_prev
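# A minimal eager sketch, ignoring seq_idx, of what _chunk_scan_bwd_ddAcs_prev computes:
# the contribution of the previous-state term to the gradient of dA_cumsum,
# exp(dA_cumsum) * sum_{p,n} dout * prev_states * C. The name _chunk_scan_bwd_ddAcs_prev_ref
# is illustrative and not part of the original API.
def _chunk_scan_bwd_ddAcs_prev_ref(prev_states, C, dout, dA_cumsum):
    batch, nchunks, nheads, headdim, dstate = prev_states.shape
    ngroups = C.shape[2]
    C = repeat(C, "b l g n -> b l (g h) n", h=nheads // ngroups)
    C = rearrange(C, "b (c l) h n -> b c l h n", c=nchunks)
    dout = rearrange(dout, "b (c l) h p -> b c l h p", c=nchunks)
    ddA = torch.einsum("bclhp,bchpn,bclhn->bhcl",
                       dout.float(), prev_states.float(), C.float())
    return ddA * torch.exp(dA_cumsum.float())  # (batch, nheads, nchunks, chunk_size)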
class ChunkScanFn(torch.autograd.Function):
@staticmethod
def forward(ctx, B, C, x, dt, dA_cumsum, prev_states, D=None, z=None):
# Check constraints.
batch, seqlen, nheads, headdim = x.shape
_, _, ngroups, dstate = B.shape
assert B.shape == (batch, seqlen, ngroups, dstate)
_, _, nchunks, chunk_size = dt.shape
assert seqlen == nchunks * chunk_size
assert C.shape == B.shape
if z is not None:
assert z.shape == x.shape
if D is not None:
assert D.shape == (nheads, headdim) or D.shape == (nheads,)
assert dt.shape == (batch, nheads, nchunks, chunk_size)
assert dA_cumsum.shape == (batch, nheads, nchunks, chunk_size)
assert prev_states.shape == (batch, nchunks, nheads, headdim, dstate)
if B.stride(-1) != 1:
B = B.contiguous()
if C.stride(-1) != 1:
C = C.contiguous()
if x.stride(-1) != 1 and x.stride(1) != 1: # Either M or K dimension should be contiguous
x = x.contiguous()
if z is not None and z.stride(-1) != 1 and z.stride(1) != 1: # Either M or K dimension should be contiguous
z = z.contiguous()
if D is not None and D.stride(-1) != 1:
D = D.contiguous()
CB = _bmm_chunk_fwd(C, B, chunk_size)
out, out_x = _chunk_scan_fwd(CB, x, dt, dA_cumsum, C, prev_states, D=D, z=z)
ctx.save_for_backward(out if z is None else out_x, B, C, CB, x, dt, dA_cumsum, prev_states, D, z)
return out
@staticmethod
def backward(ctx, dout):
if dout.stride(-1) != 1:
dout = dout.contiguous()
out, B, C, CB, x, dt, dA_cumsum, prev_states, D, z = ctx.saved_tensors
batch, seqlen, nheads, headdim = x.shape
_, _, nchunks, chunk_size = dt.shape
_, _, ngroups, dstate = B.shape
assert dout.shape == (batch, seqlen, nheads, headdim)
if z is not None:
dz, dout, dD, ddA_cumsum = _chunk_scan_bwd_dz(x, z, out, dout, chunk_size=chunk_size, D=D)
else:
dz = None
dprev_states = _chunk_scan_bwd_dstates(C, dA_cumsum, dout, dtype=prev_states.dtype)
dC = _chunk_scan_bwd_dC(prev_states, dA_cumsum, dout, ngroups=ngroups)
dC = dC.to(C.dtype)
dCB = _chunk_scan_bwd_dcb(x, dt, dA_cumsum, dout, ngroups=ngroups)
dCB = dCB.to(CB.dtype)
dB = _bmm_chunk_bwd(C, dCB)
dC = _bmm_chunk_bwd(B, rearrange(dCB, "... l s -> ... s l"), residual=dC)
dx, ddt = _chunk_scan_bwd_dx(CB, x, dt, dA_cumsum, dout, D=D)
# Formula for ddA_cumsum, assuming out is the output of the forward pass before adding x * D.
# ddA_cumsum = torch.einsum("bclhp,bclhp->bhcl", out.float(), dout.float()) - ddt * dt
        if z is not None:
            ddA_cumsum -= ddt * dt
        else:
            # z is None here, so ddA_cumsum and dD have not been computed yet;
            # the z branch above already obtained both from _chunk_scan_bwd_dz.
            ddA_cumsum, dD = _chunk_scan_bwd_ddAcs_unstable(x, dt, out, dout, ddt, D=D)
ddA_cumsum = ddA_cumsum.to(dA_cumsum.dtype)
return dB, dC, dx, ddt, ddA_cumsum, dprev_states, dD, dz
def chunk_scan(B, C, x, dt, dA_cumsum, prev_states, D=None, z=None):
"""
    prev_states holds the state entering each chunk: index 0 is the initial state, and index -1
    is the state produced by the next-to-last chunk (i.e. the state entering the last chunk).
Argument:
B: (batch, seqlen, ngroups, dstate)
C: (batch, seqlen, ngroups, dstate)
x: (batch, seqlen, nheads, headdim)
dt: (batch, nheads, nchunks, chunk_size)
dA_cumsum: (batch, nheads, nchunks, chunk_size)
prev_states: (batch, nchunks, nheads, headdim, dstate)
D: (nheads, headdim) or (nheads,)
z: (batch, seqlen, nheads, headdim)
Return:
out: (batch, seqlen, nheads, headdim)
"""
return ChunkScanFn.apply(B, C, x, dt, dA_cumsum, prev_states, D, z)
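# A hedged usage sketch for chunk_scan with tiny random inputs. The shapes follow the docstring
# above; the dA_cumsum used here is only a stand-in (in the full pipeline it would be the
# per-chunk cumulative sum of dt * A produced by the chunk-cumsum kernels). Illustrative only.
def _example_chunk_scan():
    torch.manual_seed(0)
    device = "cuda"
    batch, seqlen, nheads, headdim = 1, 128, 4, 64
    ngroups, dstate, chunk_size = 1, 16, 64
    nchunks = seqlen // chunk_size
    B = torch.randn(batch, seqlen, ngroups, dstate, device=device)
    C = torch.randn(batch, seqlen, ngroups, dstate, device=device)
    x = torch.randn(batch, seqlen, nheads, headdim, device=device)
    dt = F.softplus(torch.randn(batch, nheads, nchunks, chunk_size, device=device))
    dA_cumsum = torch.cumsum(-dt, dim=-1)  # stand-in for cumsum(dt * A) with A < 0
    prev_states = torch.randn(batch, nchunks, nheads, headdim, dstate, device=device)
    out = chunk_scan(B, C, x, dt, dA_cumsum, prev_states)
    return out  # (batch, seqlen, nheads, headdim)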
def chunk_scan_ref(B, C, x, dt, dA_cumsum, prev_states, D=None, z=None):
"""
Argument:
B: (batch, seqlen, ngroups, dstate)
C: (batch, seqlen, ngroups, dstate)
x: (batch, seqlen, nheads, headdim)
dt: (batch, nheads, nchunks, chunk_size)
dA_cumsum: (batch, nheads, nchunks, chunk_size)
prev_states: (batch, nchunks, nheads, headdim, dstate)
D: (nheads, headdim) or (nheads,)
z: (batch, seqlen, nheads, headdim)
Return:
out: (batch, seqlen, nheads, headdim)
"""
batch, seqlen, nheads, headdim = x.shape
_, _, ngroups, dstate = B.shape
assert B.shape == (batch, seqlen, ngroups, dstate)
_, _, nchunks, chunk_size = dt.shape
assert seqlen == nchunks * chunk_size
assert C.shape == B.shape
B = repeat(B, "b l g d -> b l (g h) d", h=nheads // ngroups)
C = repeat(C, "b l g d -> b l (g h) d", h=nheads // ngroups)
CB = torch.einsum("bclhn,bcshn->bchls", rearrange(C, "b (c l) h n -> b c l h n", c=nchunks),
rearrange(B, "b (c s) h n -> b c s h n", c=nchunks))
# (batch, nheads, nchunks, chunksize, chunksize)
dt_segment_sum = dA_cumsum[:, :, :, :, None] - dA_cumsum[:, :, :, None, :]
decay = torch.exp(dt_segment_sum)
scores_decay = CB * rearrange(decay, "b h c l s -> b c h l s")
causal_mask = torch.tril(torch.ones(chunk_size, chunk_size, device=x.device, dtype=bool), diagonal=0)
scores_decay = scores_decay.masked_fill(~causal_mask, 0)
out = torch.einsum('bchls,bhcs,bcshp->bclhp', scores_decay.to(x.dtype), dt.to(x.dtype),
rearrange(x, "b (c s) h p -> b c s h p", c=nchunks))
state_decay_out = torch.exp(rearrange(dA_cumsum, "b h c l -> b c l h 1"))
out_prev = torch.einsum('bclhn,bchpn->bclhp', rearrange(C, "b (c l) h n -> b c l h n", c=nchunks),
prev_states.to(C.dtype)) * state_decay_out
out = out + out_prev
out = rearrange(out, "b c l h p -> b (c l) h p")
if D is not None:
if D.dim() == 1:
D = rearrange(D, "h -> h 1")
out = out + x * D
return out if z is None else out * F.silu(z)
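# A small consistency-check sketch: run the fused Triton path and the eager reference above on the
# same inputs and report the maximum absolute difference. Illustrative helper, not part of the API.
def _check_chunk_scan_against_ref(B, C, x, dt, dA_cumsum, prev_states, D=None, z=None):
    out = chunk_scan(B, C, x, dt, dA_cumsum, prev_states, D=D, z=z)
    out_ref = chunk_scan_ref(B, C, x, dt, dA_cumsum, prev_states, D=D, z=z)
    return (out - out_ref).abs().max().item()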
# Copyright (c) 2024, Tri Dao, Albert Gu.
"""We want triton==2.1.0 or 2.2.0 for this
"""
import math
import torch
import torch.nn.functional as F
import triton
import triton.language as tl
from einops import rearrange, repeat
from mamba_ssm.ops.triton.softplus import softplus
def init_to_zero(names):
return lambda nargs: [nargs[name].zero_() for name in names if nargs[name] is not None]
@triton.autotune(
configs=[
triton.Config({'BLOCK_SIZE_H': 1}),
triton.Config({'BLOCK_SIZE_H': 2}),
triton.Config({'BLOCK_SIZE_H': 4}),
triton.Config({'BLOCK_SIZE_H': 8}),
triton.Config({'BLOCK_SIZE_H': 16}),
triton.Config({'BLOCK_SIZE_H': 32}),
triton.Config({'BLOCK_SIZE_H': 64}),
],
key=['chunk_size', 'nheads'],
)
@triton.jit
def _chunk_cumsum_fwd_kernel(
# Pointers to matrices
dt_ptr, A_ptr, dt_bias_ptr, dt_out_ptr, dA_cumsum_ptr,
# Matrix dimension
batch, seqlen, nheads, chunk_size,
dt_min, dt_max,
# Strides
stride_dt_batch, stride_dt_seqlen, stride_dt_head,
stride_A_head,
stride_dt_bias_head,
stride_dt_out_batch, stride_dt_out_chunk, stride_dt_out_head, stride_dt_out_csize,
stride_dA_cs_batch, stride_dA_cs_chunk, stride_dA_cs_head, stride_dA_cs_csize,
# Meta-parameters
DT_SOFTPLUS: tl.constexpr,
HAS_DT_BIAS: tl.constexpr,
BLOCK_SIZE_H: tl.constexpr, BLOCK_SIZE_CHUNK: tl.constexpr,
):
pid_b = tl.program_id(axis=0)
pid_c = tl.program_id(axis=1)
pid_h = tl.program_id(axis=2)
dt_ptr += pid_b * stride_dt_batch + pid_c * chunk_size * stride_dt_seqlen
dt_out_ptr += pid_b * stride_dt_out_batch + pid_c * stride_dt_out_chunk
dA_cumsum_ptr += pid_b * stride_dA_cs_batch + pid_c * stride_dA_cs_chunk
offs_h = pid_h * BLOCK_SIZE_H + tl.arange(0, BLOCK_SIZE_H)
offs_c = tl.arange(0, BLOCK_SIZE_CHUNK)
dt_ptrs = dt_ptr + (offs_h[:, None] * stride_dt_head + offs_c[None, :] * stride_dt_seqlen)
A_ptrs = A_ptr + offs_h * stride_A_head
dt_out_ptrs = dt_out_ptr + (offs_h[:, None] * stride_dt_out_head + offs_c[None, :] * stride_dt_out_csize)
dA_cs_ptrs = dA_cumsum_ptr + (offs_h[:, None] * stride_dA_cs_head + offs_c[None, :] * stride_dA_cs_csize)
chunk_size_limit = min(chunk_size, seqlen - pid_c * chunk_size)
dt = tl.load(dt_ptrs, mask=(offs_h[:, None] < nheads) & (offs_c[None, :] < chunk_size_limit), other=0.0).to(tl.float32)
if HAS_DT_BIAS:
dt_bias = tl.load(dt_bias_ptr + offs_h * stride_dt_bias_head, mask=offs_h < nheads, other=0.0).to(tl.float32)
dt += dt_bias[:, None]
if DT_SOFTPLUS:
dt = softplus(dt)
# As of Triton 2.2.0, tl.clamp is not available yet
# dt = tl.clamp(dt, dt_min, dt_max)
dt = tl.minimum(tl.maximum(dt, dt_min), dt_max)
dt = tl.where((offs_h[:, None] < nheads) & (offs_c[None, :] < chunk_size_limit), dt, 0.0)
tl.store(dt_out_ptrs, dt, mask=(offs_h[:, None] < nheads) & (offs_c[None, :] < chunk_size))
A = tl.load(A_ptrs, mask=offs_h < nheads, other=0.0).to(tl.float32)
dA = dt * A[:, None]
dA_cs = tl.cumsum(dA, axis=1)
tl.store(dA_cs_ptrs, dA_cs, mask=(offs_h[:, None] < nheads) & (offs_c[None, :] < chunk_size))
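# A minimal eager sketch of what _chunk_cumsum_fwd_kernel computes, assuming contiguous inputs
# and zero-padding up to a whole number of chunks. The function name _chunk_cumsum_fwd_ref and
# the dt_min/dt_max defaults are illustrative, not part of the original API.
def _chunk_cumsum_fwd_ref(dt, A, chunk_size, dt_bias=None, dt_softplus=False,
                          dt_min=0.0, dt_max=float("inf")):
    # dt: (batch, seqlen, nheads), A: (nheads,), dt_bias: (nheads,) or None
    batch, seqlen, nheads = dt.shape
    nchunks = math.ceil(seqlen / chunk_size)
    dt = dt.float()
    if dt_bias is not None:
        dt = dt + dt_bias.float()
    if dt_softplus:
        dt = F.softplus(dt)
    dt = dt.clamp(dt_min, dt_max)
    # Zero-pad the sequence to a whole number of chunks, mirroring the kernel's masking
    dt = F.pad(dt, (0, 0, 0, nchunks * chunk_size - seqlen))
    dt = rearrange(dt, "b (c l) h -> b h c l", c=nchunks)
    dA_cumsum = torch.cumsum(dt * A.float(), dim=-1)  # cumsum within each chunk
    return dt, dA_cumsum  # both (batch, nheads, nchunks, chunk_size)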
@triton.autotune(
configs=[
triton.Config({'BLOCK_SIZE_H': 1}, pre_hook=init_to_zero(["dA_ptr", "ddt_bias_ptr"])),
triton.Config({'BLOCK_SIZE_H': 2}, pre_hook=init_to_zero(["dA_ptr", "ddt_bias_ptr"])),
triton.Config({'BLOCK_SIZE_H': 4}, pre_hook=init_to_zero(["dA_ptr", "ddt_bias_ptr"])),
triton.Config({'BLOCK_SIZE_H': 8}, pre_hook=init_to_zero(["dA_ptr", "ddt_bias_ptr"])),
triton.Config({'BLOCK_SIZE_H': 16}, pre_hook=init_to_zero(["dA_ptr", "ddt_bias_ptr"])),
triton.Config({'BLOCK_SIZE_H': 32}, pre_hook=init_to_zero(["dA_ptr", "ddt_bias_ptr"])),
triton.Config({'BLOCK_SIZE_H': 64}, pre_hook=init_to_zero(["dA_ptr", "ddt_bias_ptr"])),
],
key=['chunk_size', 'nheads'],
)
@triton.jit
def _chunk_cumsum_bwd_kernel(
# Pointers to matrices
ddA_ptr, ddt_out_ptr, dt_ptr, A_ptr, dt_bias_ptr,
ddt_ptr, dA_ptr, ddt_bias_ptr,
# Matrix dimensions
batch, seqlen, nheads, chunk_size,
dt_min, dt_max,
# Strides
stride_ddA_batch, stride_ddA_chunk, stride_ddA_head, stride_ddA_csize,
stride_ddt_out_batch, stride_ddt_out_chunk, stride_ddt_out_head, stride_ddt_out_csize,
stride_dt_batch, stride_dt_seqlen, stride_dt_head,
stride_A_head,
stride_dt_bias_head,
stride_ddt_batch, stride_ddt_seqlen, stride_ddt_head,
stride_dA_head,
stride_ddt_bias_head,
# Meta-parameters
DT_SOFTPLUS: tl.constexpr,
HAS_DT_BIAS: tl.constexpr,
BLOCK_SIZE_H: tl.constexpr, BLOCK_SIZE_CHUNK: tl.constexpr,
):
pid_b = tl.program_id(axis=0)
pid_c = tl.program_id(axis=1)
pid_h = tl.program_id(axis=2)
ddt_out_ptr += pid_b * stride_ddt_out_batch + pid_c * stride_ddt_out_chunk
ddA_ptr += pid_b * stride_ddA_batch + pid_c * stride_ddA_chunk
dt_ptr += pid_b * stride_dt_batch + pid_c * chunk_size * stride_dt_seqlen
ddt_ptr += pid_b * stride_ddt_batch + pid_c * chunk_size * stride_ddt_seqlen
offs_h = pid_h * BLOCK_SIZE_H + tl.arange(0, BLOCK_SIZE_H)
offs_c = tl.arange(0, BLOCK_SIZE_CHUNK)
ddt_out_ptrs = ddt_out_ptr + (offs_h[:, None] * stride_ddt_out_head + offs_c[None, :] * stride_ddt_out_csize)
ddA_ptrs = ddA_ptr + (offs_h[:, None] * stride_ddA_head + offs_c[None, :] * stride_ddA_csize)
dt_ptrs = dt_ptr + (offs_h[:, None] * stride_dt_head + offs_c[None, :] * stride_dt_seqlen)
ddt_ptrs = ddt_ptr + (offs_h[:, None] * stride_ddt_head + offs_c[None, :] * stride_ddt_seqlen)
A_ptrs = A_ptr + offs_h * stride_A_head
chunk_size_limit = min(chunk_size, seqlen - pid_c * chunk_size)
ddA = tl.load(ddA_ptrs, mask=(offs_h[:, None] < nheads) & (offs_c[None, :] < chunk_size_limit), other=0.0).to(tl.float32)
ddt_out = tl.load(ddt_out_ptrs, mask=(offs_h[:, None] < nheads) & (offs_c[None, :] < chunk_size_limit), other=0.0).to(tl.float32)
A = tl.load(A_ptrs, mask=offs_h < nheads, other=0.0).to(tl.float32)
ddt = ddA * A[:, None] + ddt_out
dt = tl.load(dt_ptrs, mask=(offs_h[:, None] < nheads) & (offs_c[None, :] < chunk_size_limit), other=0.0).to(tl.float32)
if HAS_DT_BIAS:
dt_bias = tl.load(dt_bias_ptr + offs_h * stride_dt_bias_head, mask=offs_h < nheads, other=0.0).to(tl.float32)
dt += dt_bias[:, None]
if DT_SOFTPLUS:
dt_presoftplus = dt
dt = softplus(dt)
clamp_mask = (dt < dt_min) | (dt > dt_max)
# As of Triton 2.2.0, tl.clamp is not available yet
# dt = tl.clamp(dt, dt_min, dt_max)
dt = tl.minimum(tl.maximum(dt, dt_min), dt_max)
dt = tl.where((offs_h[:, None] < nheads) & (offs_c[None, :] < chunk_size_limit), dt, 0.0)
ddt = tl.where((offs_h[:, None] < nheads) & (offs_c[None, :] < chunk_size_limit), ddt, 0.0)
ddt = tl.where(clamp_mask, 0.0, ddt)
if DT_SOFTPLUS:
ddt = tl.where(dt_presoftplus <= 20.0, ddt * tl.sigmoid(dt_presoftplus), ddt)
tl.store(ddt_ptrs, ddt, mask=(offs_h[:, None] < nheads) & (offs_c[None, :] < chunk_size_limit))
dA = tl.sum(ddA * dt, axis=1)
tl.atomic_add(dA_ptr + offs_h * stride_dA_head, dA, mask=offs_h < nheads)
if HAS_DT_BIAS:
ddt_bias = tl.sum(ddt, axis=1)
tl.atomic_add(ddt_bias_ptr + offs_h * stride_ddt_bias_head, ddt_bias, mask=offs_h < nheads)
@triton.autotune(
configs=[
triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64}, num_stages=3, num_warps=8),
triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4),
triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4),
triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4),
triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4),
triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4),
triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32}, num_stages=5, num_warps=2),
triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32}, num_stages=5, num_warps=2),
triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=2),
],
key=['hdim', 'dstate', 'chunk_size'],
)
@triton.jit
def _chunk_state_fwd_kernel(
# Pointers to matrices
x_ptr, b_ptr, states_ptr, dt_ptr, dA_cumsum_ptr, seq_idx_ptr,
# Matrix dimensions
hdim, dstate, chunk_size,
batch, seqlen, nheads_ngroups_ratio,
# Strides
stride_x_batch, stride_x_seqlen, stride_x_head, stride_x_hdim,
stride_b_batch, stride_b_seqlen, stride_b_head, stride_b_dstate,
stride_states_batch, stride_states_chunk, stride_states_head, stride_states_hdim, stride_states_dstate,
stride_dt_batch, stride_dt_chunk, stride_dt_head, stride_dt_csize,
stride_dA_cs_batch, stride_dA_cs_chunk, stride_dA_cs_head, stride_dA_cs_csize,
stride_seq_idx_batch, stride_seq_idx_seqlen,
# Meta-parameters
HAS_SEQ_IDX: tl.constexpr,
BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
):
pid_bc = tl.program_id(axis=1)
pid_c = pid_bc // batch
pid_b = pid_bc - pid_c * batch
pid_h = tl.program_id(axis=2)
num_pid_n = tl.cdiv(dstate, BLOCK_SIZE_N)
pid_m = tl.program_id(axis=0) // num_pid_n
pid_n = tl.program_id(axis=0) % num_pid_n
b_ptr += pid_b * stride_b_batch + pid_c * chunk_size * stride_b_seqlen + (pid_h // nheads_ngroups_ratio) * stride_b_head
x_ptr += pid_b * stride_x_batch + pid_c * chunk_size * stride_x_seqlen + pid_h * stride_x_head
dt_ptr += pid_b * stride_dt_batch + pid_c * stride_dt_chunk + pid_h * stride_dt_head
dA_cumsum_ptr += pid_b * stride_dA_cs_batch + pid_c * stride_dA_cs_chunk + pid_h * stride_dA_cs_head
if HAS_SEQ_IDX:
seq_idx_ptr += pid_b * stride_seq_idx_batch + pid_c * chunk_size * stride_seq_idx_seqlen
offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
offs_k = tl.arange(0, BLOCK_SIZE_K)
x_ptrs = x_ptr + (offs_m[:, None] * stride_x_hdim + offs_k[None, :] * stride_x_seqlen)
b_ptrs = b_ptr + (offs_n[None, :] * stride_b_dstate + offs_k[:, None] * stride_b_seqlen)
dt_ptrs = dt_ptr + offs_k * stride_dt_csize
dA_cs_last = tl.load(dA_cumsum_ptr + (chunk_size - 1) * stride_dA_cs_csize).to(tl.float32)
dA_cumsum_ptrs = dA_cumsum_ptr + offs_k * stride_dA_cs_csize
if HAS_SEQ_IDX:
seq_idx_ptrs = seq_idx_ptr + offs_k * stride_seq_idx_seqlen
chunk_size_limit = min(chunk_size, seqlen - pid_c * chunk_size)
if HAS_SEQ_IDX:
seq_idx_last = tl.load(seq_idx_ptr + (chunk_size_limit - 1) * stride_seq_idx_seqlen)
acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
for k in range(0, chunk_size_limit, BLOCK_SIZE_K):
x = tl.load(x_ptrs, mask=(offs_m[:, None] < hdim) & (offs_k[None, :] < chunk_size_limit - k), other=0.0)
b = tl.load(b_ptrs, mask=(offs_k[:, None] < chunk_size_limit - k) & (offs_n[None, :] < dstate), other=0.0).to(tl.float32)
dA_cs_k = tl.load(dA_cumsum_ptrs, mask=offs_k < chunk_size_limit - k, other=0.0).to(tl.float32)
if HAS_SEQ_IDX:
seq_idx_k = tl.load(seq_idx_ptrs, mask=offs_k < chunk_size_limit - k, other=-1)
dt_k = tl.load(dt_ptrs, mask=offs_k < chunk_size_limit - k, other=0.0).to(tl.float32)
        if not HAS_SEQ_IDX:
            scale = tl.exp(dA_cs_last - dA_cs_k) * dt_k
        else:
            scale = tl.where(seq_idx_k == seq_idx_last, tl.exp(dA_cs_last - dA_cs_k) * dt_k, 0.0)
b *= scale[:, None]
b = b.to(x_ptr.dtype.element_ty)
acc += tl.dot(x, b)
x_ptrs += BLOCK_SIZE_K * stride_x_seqlen
b_ptrs += BLOCK_SIZE_K * stride_b_seqlen
dt_ptrs += BLOCK_SIZE_K * stride_dt_csize
dA_cumsum_ptrs += BLOCK_SIZE_K * stride_dA_cs_csize
if HAS_SEQ_IDX:
seq_idx_ptrs += BLOCK_SIZE_K * stride_seq_idx_seqlen
states = acc.to(states_ptr.dtype.element_ty)
states_ptr += pid_b * stride_states_batch + pid_c * stride_states_chunk + pid_h * stride_states_head
offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
states_ptrs = states_ptr + (offs_m[:, None] * stride_states_hdim + offs_n[None, :] * stride_states_dstate)
c_mask = (offs_m[:, None] < hdim) & (offs_n[None, :] < dstate)
tl.store(states_ptrs, states, mask=c_mask)
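# A minimal eager sketch, ignoring seq_idx, of what _chunk_state_fwd_kernel computes: each chunk's
# end-of-chunk state as a decay-weighted sum of B, dt and x over the chunk, assuming
# seqlen == nchunks * chunk_size. The name _chunk_state_fwd_ref is illustrative only.
def _chunk_state_fwd_ref(B, x, dt, dA_cumsum):
    # B: (batch, seqlen, ngroups, dstate), x: (batch, seqlen, nheads, headdim)
    # dt, dA_cumsum: (batch, nheads, nchunks, chunk_size)
    batch, seqlen, nheads, headdim = x.shape
    _, _, nchunks, chunk_size = dt.shape
    ngroups = B.shape[2]
    B = repeat(B, "b l g n -> b l (g h) n", h=nheads // ngroups)
    B = rearrange(B, "b (c l) h n -> b c l h n", c=nchunks)
    x = rearrange(x, "b (c l) h p -> b c l h p", c=nchunks)
    # Decay from each position to the end of its chunk
    decay = torch.exp(dA_cumsum[:, :, :, -1:] - dA_cumsum)  # (batch, nheads, nchunks, chunk_size)
    states = torch.einsum("bclhn,bhcl,bhcl,bclhp->bchpn",
                          B.float(), decay.float(), dt.float(), x.float())
    return states  # (batch, nchunks, nheads, headdim, dstate)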
@triton.autotune(
configs=[
triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64}, num_stages=3, num_warps=8, pre_hook=init_to_zero(["ddt_ptr", "ddA_cumsum_ptr"])),
triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4, pre_hook=init_to_zero(["ddt_ptr", "ddA_cumsum_ptr"])),
triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4, pre_hook=init_to_zero(["ddt_ptr", "ddA_cumsum_ptr"])),
triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4, pre_hook=init_to_zero(["ddt_ptr", "ddA_cumsum_ptr"])),
triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4, pre_hook=init_to_zero(["ddt_ptr", "ddA_cumsum_ptr"])),
triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4, pre_hook=init_to_zero(["ddt_ptr", "ddA_cumsum_ptr"])),
triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32}, num_stages=5, num_warps=4, pre_hook=init_to_zero(["ddt_ptr", "ddA_cumsum_ptr"])),
triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32}, num_stages=5, num_warps=4, pre_hook=init_to_zero(["ddt_ptr", "ddA_cumsum_ptr"])),
triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4, pre_hook=init_to_zero(["ddt_ptr", "ddA_cumsum_ptr"])),
],
key=['chunk_size', 'hdim', 'dstate'],
)
@triton.jit
def _chunk_state_bwd_dx_kernel(
# Pointers to matrices
x_ptr, b_ptr, dstates_ptr, dt_ptr, dA_cumsum_ptr,
dx_ptr, ddt_ptr, ddA_cumsum_ptr,
# Matrix dimensions
chunk_size, hdim, dstate,
batch, seqlen, nheads_ngroups_ratio,
# Strides
stride_x_batch, stride_x_seqlen, stride_x_head, stride_x_hdim,
stride_b_batch, stride_b_seqlen, stride_b_head, stride_b_dstate,
stride_dstates_batch, stride_dstates_chunk, stride_states_head, stride_states_hdim, stride_states_dstate,
stride_dt_batch, stride_dt_chunk, stride_dt_head, stride_dt_csize,
stride_dA_cs_batch, stride_dA_cs_chunk, stride_dA_cs_head, stride_dA_cs_csize,
stride_dx_batch, stride_dx_seqlen, stride_dx_head, stride_dx_hdim,
stride_ddt_batch, stride_ddt_chunk, stride_ddt_head, stride_ddt_csize,
stride_ddA_cs_batch, stride_ddA_cs_chunk, stride_ddA_cs_head, stride_ddA_cs_csize,
# Meta-parameters
BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
BLOCK_SIZE_DSTATE: tl.constexpr,
):
pid_bc = tl.program_id(axis=1)
pid_c = pid_bc // batch
pid_b = pid_bc - pid_c * batch
pid_h = tl.program_id(axis=2)
num_pid_n = tl.cdiv(hdim, BLOCK_SIZE_N)
pid_m = tl.program_id(axis=0) // num_pid_n
pid_n = tl.program_id(axis=0) % num_pid_n
x_ptr += pid_b * stride_x_batch + pid_c * chunk_size * stride_x_seqlen + pid_h * stride_x_head
b_ptr += pid_b * stride_b_batch + pid_c * chunk_size * stride_b_seqlen + (pid_h // nheads_ngroups_ratio) * stride_b_head
dstates_ptr += pid_b * stride_dstates_batch + pid_c * stride_dstates_chunk + pid_h * stride_states_head
dt_ptr += pid_b * stride_dt_batch + pid_c * stride_dt_chunk + pid_h * stride_dt_head
ddt_ptr += pid_b * stride_ddt_batch + pid_c * stride_ddt_chunk + pid_h * stride_ddt_head
ddA_cumsum_ptr += pid_b * stride_ddA_cs_batch + pid_c * stride_ddA_cs_chunk + pid_h * stride_ddA_cs_head
dA_cumsum_ptr += pid_b * stride_dA_cs_batch + pid_c * stride_dA_cs_chunk + pid_h * stride_dA_cs_head
offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
chunk_size_limit = min(chunk_size, seqlen - pid_c * chunk_size)
# Faster to just do 1 iteration with larger BLOCK_SIZE_K, up to block size 128
offs_k = tl.arange(0, BLOCK_SIZE_DSTATE if BLOCK_SIZE_DSTATE <= 128 else BLOCK_SIZE_K)
b_ptrs = b_ptr + (offs_m[:, None] * stride_b_seqlen + offs_k[None, :] * stride_b_dstate)
dstates_ptrs = dstates_ptr + (offs_n[None, :] * stride_states_hdim + offs_k[:, None] * stride_states_dstate)
if BLOCK_SIZE_DSTATE <= 128:
b = tl.load(b_ptrs, mask=(offs_m[:, None] < chunk_size_limit) & (offs_k[None, :] < dstate), other=0.0)
dstates = tl.load(dstates_ptrs, mask=(offs_k[:, None] < dstate) & (offs_n[None, :] < hdim), other=0.0)
dstates = dstates.to(b_ptr.dtype.element_ty)
acc = tl.dot(b, dstates)
else:
acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
for k in range(0, dstate, BLOCK_SIZE_K):
b = tl.load(b_ptrs, mask=(offs_m[:, None] < chunk_size_limit) & (offs_k[None, :] < dstate - k), other=0.0)
dstates = tl.load(dstates_ptrs, mask=(offs_k[:, None] < dstate - k) & (offs_n[None, :] < hdim), other=0.0)
dstates = dstates.to(b_ptr.dtype.element_ty)
acc += tl.dot(b, dstates)
b_ptrs += BLOCK_SIZE_K * stride_b_dstate
dstates_ptrs += BLOCK_SIZE_K * stride_states_dstate
offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
dA_cs_last = tl.load(dA_cumsum_ptr + (chunk_size - 1) * stride_dA_cs_csize).to(tl.float32)
dt_ptrs = dt_ptr + offs_m * stride_dt_csize
dA_cumsum_ptrs = dA_cumsum_ptr + offs_m * stride_dA_cs_csize
dA_cs_m = tl.load(dA_cumsum_ptrs, mask=offs_m < chunk_size, other=0.0).to(tl.float32)
dt_m = tl.load(dt_ptrs, mask=offs_m < chunk_size, other=0.0).to(tl.float32)
acc *= tl.exp(dA_cs_last - dA_cs_m)[:, None]
x_ptrs = x_ptr + (offs_m[:, None] * stride_x_seqlen + offs_n[None, :] * stride_x_hdim)
x = tl.load(x_ptrs, mask=(offs_m[:, None] < chunk_size_limit) & (offs_n[None, :] < hdim), other=0.0).to(tl.float32)
ddt = tl.sum(acc * x, axis=1)
ddt_ptrs = ddt_ptr + offs_m * stride_ddt_csize
tl.atomic_add(ddt_ptrs, ddt, mask=offs_m < chunk_size)
ddA_cs = -(ddt * dt_m)
ddA_cs_last = -tl.sum(ddA_cs)
ddA_cumsum_ptrs = ddA_cumsum_ptr + offs_m * stride_ddA_cs_csize
tl.atomic_add(ddA_cumsum_ptrs, ddA_cs, mask=offs_m < chunk_size)
tl.atomic_add(ddA_cumsum_ptr + (chunk_size - 1) * stride_ddA_cs_csize, ddA_cs_last)
dx = (acc * dt_m[:, None]).to(dx_ptr.dtype.element_ty)
dx_ptr += pid_b * stride_dx_batch + pid_c * chunk_size * stride_dx_seqlen + pid_h * stride_dx_head
dx_ptrs = dx_ptr + (offs_m[:, None] * stride_dx_seqlen + offs_n[None, :] * stride_dx_hdim)
tl.store(dx_ptrs, dx, mask=(offs_m[:, None] < chunk_size_limit) & (offs_n[None, :] < hdim))
@triton.autotune(
configs=[
triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 128}, num_stages=3, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 32}, num_stages=3, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128}, num_stages=3, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64}, num_stages=3, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64}, num_stages=3, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 32}, num_stages=3, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 64}, num_stages=3, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 32}, num_stages=3, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
],
key=['chunk_size', 'dstate', 'hdim'],
)
@triton.jit
def _chunk_state_bwd_db_kernel(
# Pointers to matrices
x_ptr, dstates_ptr, b_ptr, dt_ptr, dA_cumsum_ptr, seq_idx_ptr,
db_ptr, ddA_cumsum_ptr,
# Matrix dimensions
chunk_size, dstate, hdim,
batch, seqlen, nheads, nheads_per_program, ngroups,
# Strides
stride_x_batch, stride_x_seqlen, stride_x_head, stride_x_hdim,
stride_dstates_batch, stride_dstates_chunk, stride_states_head, stride_states_hdim, stride_states_dstate,
stride_b_batch, stride_b_seqlen, stride_b_head, stride_b_dstate,
stride_dt_batch, stride_dt_chunk, stride_dt_head, stride_dt_csize,
stride_dA_cs_batch, stride_dA_cs_chunk, stride_dA_cs_head, stride_dA_cs_csize,
stride_seq_idx_batch, stride_seq_idx_seqlen,
stride_db_batch, stride_db_seqlen, stride_db_split, stride_db_group, stride_db_dstate,
stride_ddA_cs_batch, stride_ddA_cs_chunk, stride_ddA_cs_head, stride_ddA_cs_csize,
# Meta-parameters
HAS_DDA_CS: tl.constexpr,
HAS_SEQ_IDX: tl.constexpr,
BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
):
pid_bc = tl.program_id(axis=1)
pid_c = pid_bc // batch
pid_b = pid_bc - pid_c * batch
pid_sg = tl.program_id(axis=2)
pid_s = pid_sg // ngroups
pid_g = pid_sg - pid_s * ngroups
num_pid_n = tl.cdiv(dstate, BLOCK_SIZE_N)
pid_m = tl.program_id(axis=0) // num_pid_n
pid_n = tl.program_id(axis=0) % num_pid_n
x_ptr += pid_b * stride_x_batch + pid_c * chunk_size * stride_x_seqlen + (pid_g * (nheads // ngroups) + pid_s * nheads_per_program) * stride_x_head
db_ptr += pid_b * stride_db_batch + pid_c * chunk_size * stride_db_seqlen + pid_g * stride_db_group + pid_s * stride_db_split
dstates_ptr += pid_b * stride_dstates_batch + pid_c * stride_dstates_chunk + (pid_g * (nheads // ngroups) + pid_s * nheads_per_program) * stride_states_head
dt_ptr += pid_b * stride_dt_batch + pid_c * stride_dt_chunk + (pid_g * (nheads // ngroups) + pid_s * nheads_per_program) * stride_dt_head
dA_cumsum_ptr += pid_b * stride_dA_cs_batch + pid_c * stride_dA_cs_chunk + (pid_g * (nheads // ngroups) + pid_s * nheads_per_program) * stride_dA_cs_head
if HAS_DDA_CS:
b_ptr += pid_b * stride_b_batch + pid_c * chunk_size * stride_b_seqlen + pid_g * stride_b_head
ddA_cumsum_ptr += pid_b * stride_ddA_cs_batch + pid_c * stride_ddA_cs_chunk + (pid_g * (nheads // ngroups) + pid_s * nheads_per_program) * stride_ddA_cs_head
if HAS_SEQ_IDX:
seq_idx_ptr += pid_b * stride_seq_idx_batch + pid_c * chunk_size * stride_seq_idx_seqlen
offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
offs_k = tl.arange(0, BLOCK_SIZE_K)
x_ptrs = x_ptr + (offs_m[:, None] * stride_x_seqlen + offs_k[None, :] * stride_x_hdim)
dstates_ptrs = dstates_ptr + (offs_n[None, :] * stride_states_dstate + offs_k[:, None] * stride_states_hdim)
dt_ptrs = dt_ptr + offs_m * stride_dt_csize
dA_cumsum_ptrs = dA_cumsum_ptr + offs_m * stride_dA_cs_csize
if HAS_DDA_CS:
b_ptrs = b_ptr + (offs_m[:, None] * stride_b_seqlen + offs_n[None, :] * stride_b_dstate)
ddA_cumsum_ptrs = ddA_cumsum_ptr + offs_m * stride_ddA_cs_csize
chunk_size_limit = min(chunk_size, seqlen - pid_c * chunk_size)
acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
if HAS_DDA_CS:
b = tl.load(b_ptrs, mask=(offs_m[:, None] < chunk_size_limit) & (offs_n[None, :] < dstate), other=0.0).to(tl.float32)
if HAS_SEQ_IDX:
seq_idx_m = tl.load(seq_idx_ptr + offs_m * stride_seq_idx_seqlen, mask=offs_m < chunk_size_limit, other=-1)
seq_idx_last = tl.load(seq_idx_ptr + (chunk_size_limit - 1) * stride_seq_idx_seqlen)
nheads_iter = min(nheads_per_program, nheads // ngroups - pid_s * nheads_per_program)
for h in range(nheads_iter):
x = tl.load(x_ptrs, mask=(offs_m[:, None] < chunk_size_limit) & (offs_k[None, :] < hdim), other=0.0)
dstates = tl.load(dstates_ptrs, mask=(offs_k[:, None] < hdim) & (offs_n[None, :] < dstate), other=0.0)
dstates = dstates.to(x_ptrs.dtype.element_ty)
db = tl.dot(x, dstates)
dA_cs_last = tl.load(dA_cumsum_ptr + (chunk_size - 1) * stride_dA_cs_csize).to(tl.float32)
dA_cs_m = tl.load(dA_cumsum_ptrs, mask=offs_m < chunk_size, other=0.0).to(tl.float32)
dt_m = tl.load(dt_ptrs, mask=offs_m < chunk_size, other=0.0).to(tl.float32)
if not HAS_SEQ_IDX:
scale = tl.exp(dA_cs_last - dA_cs_m)
else:
scale = tl.where(seq_idx_m == seq_idx_last, tl.exp(dA_cs_last - dA_cs_m), 0.0)
db *= (scale * dt_m)[:, None]
if HAS_DDA_CS:
# This is the gradient wrt (dA_cs_last - dA_cs_m), i.e. the exclusive reverse cumsum
ddA_cs = tl.sum(db * b, axis=1)
tl.atomic_add(ddA_cumsum_ptrs + stride_ddA_cs_csize, ddA_cs, mask=offs_m < chunk_size - 1)
acc += db
x_ptrs += stride_x_head
dstates_ptrs += stride_states_head
dt_ptrs += stride_dt_head
dA_cumsum_ptr += stride_dA_cs_head
dA_cumsum_ptrs += stride_dA_cs_head
if HAS_DDA_CS:
ddA_cumsum_ptrs += stride_ddA_cs_head
offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
# if HAS_SEQ_IDX:
# seq_idx_last = tl.load(seq_idx_ptr + (chunk_size_limit - 1) * stride_seq_idx_seqlen)
# seq_idx_m = tl.load(seq_idx_ptr + offs_m * stride_seq_idx_seqlen, mask=offs_m < chunk_size_limit, other=-1)
# acc = tl.where(seq_idx_m[:, None] == seq_idx_last, acc, 0.0)
db_ptrs = db_ptr + (offs_m[:, None] * stride_db_seqlen + offs_n[None, :] * stride_db_dstate)
tl.store(db_ptrs, acc, mask=(offs_m[:, None] < chunk_size_limit) & (offs_n[None, :] < dstate))
@triton.autotune(
configs=[
# triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64}, num_stages=3, num_warps=8, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
# triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
# triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
# triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
# triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
# triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
# triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32}, num_stages=5, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
# triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32}, num_stages=5, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
# triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
triton.Config({'BLOCK_SIZE_N': 16, 'BLOCK_SIZE_K': 32}, num_stages=3, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
triton.Config({'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32}, num_stages=3, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
triton.Config({'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32}, num_stages=3, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
triton.Config({'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32}, num_stages=3, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
triton.Config({'BLOCK_SIZE_N': 16, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=8, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
triton.Config({'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=8, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
triton.Config({'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=8, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
triton.Config({'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=8, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
],
key=['chunk_size', 'hdim', 'dstate'],
)
@triton.jit
def _chunk_state_bwd_ddAcs_stable_kernel(
# Pointers to matrices
x_ptr, b_ptr, dstates_ptr, dt_ptr, dA_cumsum_ptr, seq_idx_ptr,
ddA_cumsum_ptr,
# Matrix dimensions
chunk_size, hdim, dstate,
batch, seqlen, nheads_ngroups_ratio,
# Strides
stride_x_batch, stride_x_seqlen, stride_x_head, stride_x_hdim,
stride_b_batch, stride_b_seqlen, stride_b_head, stride_b_dstate,
stride_dstates_batch, stride_dstates_chunk, stride_states_head, stride_states_hdim, stride_states_dstate,
stride_dt_batch, stride_dt_chunk, stride_dt_head, stride_dt_csize,
stride_dA_cs_batch, stride_dA_cs_chunk, stride_dA_cs_head, stride_dA_cs_csize,
stride_seq_idx_batch, stride_seq_idx_seqlen,
stride_ddA_cs_batch, stride_ddA_cs_chunk, stride_ddA_cs_head, stride_ddA_cs_csize,
# Meta-parameters
HAS_SEQ_IDX: tl.constexpr,
BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
BLOCK_SIZE_DSTATE: tl.constexpr,
):
pid_bc = tl.program_id(axis=1)
pid_c = pid_bc // batch
pid_b = pid_bc - pid_c * batch
pid_h = tl.program_id(axis=2)
num_pid_n = tl.cdiv(hdim, BLOCK_SIZE_N)
pid_m = tl.program_id(axis=0) // num_pid_n
pid_n = tl.program_id(axis=0) % num_pid_n
x_ptr += pid_b * stride_x_batch + pid_c * chunk_size * stride_x_seqlen + pid_h * stride_x_head
b_ptr += pid_b * stride_b_batch + pid_c * chunk_size * stride_b_seqlen + (pid_h // nheads_ngroups_ratio) * stride_b_head
dstates_ptr += pid_b * stride_dstates_batch + pid_c * stride_dstates_chunk + pid_h * stride_states_head
dt_ptr += pid_b * stride_dt_batch + pid_c * stride_dt_chunk + pid_h * stride_dt_head
ddA_cumsum_ptr += pid_b * stride_ddA_cs_batch + pid_c * stride_ddA_cs_chunk + pid_h * stride_ddA_cs_head
dA_cumsum_ptr += pid_b * stride_dA_cs_batch + pid_c * stride_dA_cs_chunk + pid_h * stride_dA_cs_head
if HAS_SEQ_IDX:
seq_idx_ptr += pid_b * stride_seq_idx_batch + pid_c * chunk_size * stride_seq_idx_seqlen
offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
chunk_size_limit = min(chunk_size, seqlen - pid_c * chunk_size)
# Faster to just do 1 iteration with larger BLOCK_SIZE_K, up to block size 128
offs_k = tl.arange(0, BLOCK_SIZE_DSTATE if BLOCK_SIZE_DSTATE <= 128 else BLOCK_SIZE_K)
b_ptrs = b_ptr + (offs_m[:, None] * stride_b_seqlen + offs_k[None, :] * stride_b_dstate)
dstates_ptrs = dstates_ptr + (offs_n[None, :] * stride_states_hdim + offs_k[:, None] * stride_states_dstate)
if BLOCK_SIZE_DSTATE <= 128:
b = tl.load(b_ptrs, mask=(offs_m[:, None] < chunk_size_limit) & (offs_k[None, :] < dstate), other=0.0)
dstates = tl.load(dstates_ptrs, mask=(offs_k[:, None] < dstate) & (offs_n[None, :] < hdim), other=0.0)
dstates = dstates.to(b_ptr.dtype.element_ty)
acc = tl.dot(b, dstates)
else:
acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
for k in range(0, dstate, BLOCK_SIZE_K):
b = tl.load(b_ptrs, mask=(offs_m[:, None] < chunk_size_limit) & (offs_k[None, :] < dstate - k), other=0.0)
dstates = tl.load(dstates_ptrs, mask=(offs_k[:, None] < dstate - k) & (offs_n[None, :] < hdim), other=0.0)
dstates = dstates.to(b_ptr.dtype.element_ty)
acc += tl.dot(b, dstates)
b_ptrs += BLOCK_SIZE_K * stride_b_dstate
dstates_ptrs += BLOCK_SIZE_K * stride_states_dstate
offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
dA_cs_m = tl.load(dA_cumsum_ptr + offs_m * stride_dA_cs_csize, mask=offs_m < chunk_size, other=0.0).to(tl.float32)
dA_cs_last = tl.load(dA_cumsum_ptr + (chunk_size - 1) * stride_dA_cs_csize).to(tl.float32)
if not HAS_SEQ_IDX:
scale = tl.exp(dA_cs_last - dA_cs_m)
else:
seq_idx_m = tl.load(seq_idx_ptr + offs_m * stride_seq_idx_seqlen, mask=offs_m < chunk_size_limit, other=-1)
seq_idx_last = tl.load(seq_idx_ptr + (chunk_size_limit - 1) * stride_seq_idx_seqlen)
scale = tl.where(seq_idx_m == seq_idx_last, tl.exp(dA_cs_last - dA_cs_m), 0.0)
acc *= scale[:, None]
x_ptrs = x_ptr + (offs_m[:, None] * stride_x_seqlen + offs_n[None, :] * stride_x_hdim)
x = tl.load(x_ptrs, mask=(offs_m[:, None] < chunk_size_limit) & (offs_n[None, :] < hdim), other=0.0).to(tl.float32)
dt_ptrs = dt_ptr + offs_m * stride_dt_csize
dt_m = tl.load(dt_ptrs, mask=offs_m < chunk_size, other=0.0).to(tl.float32)
ddt = tl.sum(acc * x, axis=1)
# ddA_cs = -(ddt * dt_m)
# Triton 2.2.0 errors if we have the cumsum here, so we just write it out
# then call torch.cumsum outside this kernel.
# ddA_cs = tl.cumsum(ddt * dt_m)
ddA_cs = ddt * dt_m
ddA_cumsum_ptrs = ddA_cumsum_ptr + offs_m * stride_ddA_cs_csize
# tl.atomic_add(ddA_cumsum_ptrs, ddA_cs, mask=offs_m < chunk_size)
tl.atomic_add(ddA_cumsum_ptrs + stride_ddA_cs_csize, ddA_cs, mask=offs_m < chunk_size - 1)
@triton.autotune(
configs=[
triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64}, num_stages=3, num_warps=8),
triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4),
triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4),
triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4),
triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4),
triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4),
triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32}, num_stages=5, num_warps=2),
triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32}, num_stages=5, num_warps=2),
triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=2),
],
key=['hdim', 'dstate', 'chunk_size'],
)
@triton.jit
def _chunk_state_varlen_kernel(
# Pointers to matrices
x_ptr, b_ptr, dt_ptr, dA_cumsum_ptr, chunk_states_ptr, cu_seqlens_ptr, states_ptr,
# Matrix dimensions
hdim, dstate, chunk_size,
seqlen, nheads_ngroups_ratio,
# Strides
stride_x_seqlen, stride_x_head, stride_x_hdim,
stride_b_seqlen, stride_b_head, stride_b_dstate,
stride_dt_chunk, stride_dt_head, stride_dt_csize,
stride_dA_cs_chunk, stride_dA_cs_head, stride_dA_cs_csize,
stride_chunk_states_chunk, stride_chunk_states_head, stride_chunk_states_hdim, stride_chunk_states_dstate,
stride_states_batch, stride_states_head, stride_states_hdim, stride_states_dstate,
# Meta-parameters
BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
):
pid_b = tl.program_id(axis=1)
pid_h = tl.program_id(axis=2)
num_pid_n = tl.cdiv(dstate, BLOCK_SIZE_N)
pid_m = tl.program_id(axis=0) // num_pid_n
pid_n = tl.program_id(axis=0) % num_pid_n
end_idx = tl.load(cu_seqlens_ptr + pid_b + 1)
pid_c = (end_idx - 1) // chunk_size
b_ptr += pid_c * chunk_size * stride_b_seqlen + (pid_h // nheads_ngroups_ratio) * stride_b_head
x_ptr += pid_c * chunk_size * stride_x_seqlen + pid_h * stride_x_head
dt_ptr += pid_c * stride_dt_chunk + pid_h * stride_dt_head
dA_cumsum_ptr += pid_c * stride_dA_cs_chunk + pid_h * stride_dA_cs_head
chunk_states_ptr += pid_c * stride_chunk_states_chunk + pid_h * stride_chunk_states_head
offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
offs_k = tl.arange(0, BLOCK_SIZE_K)
x_ptrs = x_ptr + (offs_m[:, None] * stride_x_hdim + offs_k[None, :] * stride_x_seqlen)
b_ptrs = b_ptr + (offs_n[None, :] * stride_b_dstate + offs_k[:, None] * stride_b_seqlen)
dt_ptrs = dt_ptr + offs_k * stride_dt_csize
dA_cs_last = tl.load(dA_cumsum_ptr + (end_idx - pid_c * chunk_size - 1) * stride_dA_cs_csize).to(tl.float32)
dA_cumsum_ptrs = dA_cumsum_ptr + offs_k * stride_dA_cs_csize
chunk_size_limit = end_idx - pid_c * chunk_size
start_idx = tl.load(cu_seqlens_ptr + pid_b)
start_idx_cur = tl.maximum(start_idx - pid_c * chunk_size, 0)
acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
for k in range(0, chunk_size_limit, BLOCK_SIZE_K):
x = tl.load(x_ptrs, mask=(offs_m[:, None] < hdim) & (offs_k[None, :] < chunk_size_limit - k) & (offs_k[None, :] >= start_idx_cur - k), other=0.0)
b = tl.load(b_ptrs, mask=(offs_k[:, None] < chunk_size_limit - k) & (offs_n[None, :] < dstate) & (offs_k[:, None] >= start_idx_cur - k), other=0.0).to(tl.float32)
dA_cs_k = tl.load(dA_cumsum_ptrs, mask=offs_k < chunk_size_limit - k, other=0.0).to(tl.float32)
dt_k = tl.load(dt_ptrs, mask=offs_k < chunk_size_limit - k, other=0.0).to(tl.float32)
scale = tl.where((offs_k >= start_idx_cur - k) & (offs_k < chunk_size_limit - k),
tl.exp((dA_cs_last - dA_cs_k)) * dt_k, 0.0)
b *= scale[:, None]
b = b.to(x_ptr.dtype.element_ty)
acc += tl.dot(x, b)
x_ptrs += BLOCK_SIZE_K * stride_x_seqlen
b_ptrs += BLOCK_SIZE_K * stride_b_seqlen
dt_ptrs += BLOCK_SIZE_K * stride_dt_csize
dA_cumsum_ptrs += BLOCK_SIZE_K * stride_dA_cs_csize
    # Only add the carried-over contribution from the state at the start of this chunk if the
    # sequence started in an earlier chunk; if it starts within this chunk, there is nothing to carry over.
if start_idx < pid_c * chunk_size:
chunk_states_ptrs = chunk_states_ptr + (offs_m[:, None] * stride_chunk_states_hdim + offs_n[None, :] * stride_chunk_states_dstate)
chunk_states = tl.load(chunk_states_ptrs, mask=(offs_m[:, None] < hdim) & (offs_n[None, :] < dstate), other=0.0).to(tl.float32)
# scale = tl.where(start_idx < pid_c * chunk_size, tl.exp(dA_cs_last), 0.0)
scale = tl.exp(dA_cs_last)
acc += chunk_states * scale
states = acc.to(states_ptr.dtype.element_ty)
states_ptr += pid_b * stride_states_batch + pid_h * stride_states_head
offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
states_ptrs = states_ptr + (offs_m[:, None] * stride_states_hdim + offs_n[None, :] * stride_states_dstate)
c_mask = (offs_m[:, None] < hdim) & (offs_n[None, :] < dstate)
tl.store(states_ptrs, states, mask=c_mask)
def _chunk_cumsum_fwd(dt, A, chunk_size, dt_bias=None, dt_softplus=False, dt_limit=(0.0, float("inf"))):
batch, seqlen, nheads = dt.shape
assert A.shape == (nheads,)
if dt_bias is not None:
assert dt_bias.shape == (nheads,)
nchunks = math.ceil(seqlen / chunk_size)
dt_out = torch.empty(batch, nheads, nchunks, chunk_size, device=dt.device, dtype=torch.float32)
dA_cumsum = torch.empty(batch, nheads, nchunks, chunk_size, device=dt.device, dtype=torch.float32)
grid_chunk_cs = lambda META: (batch, nchunks, triton.cdiv(nheads, META['BLOCK_SIZE_H']))
with torch.cuda.device(dt.device.index):
_chunk_cumsum_fwd_kernel[grid_chunk_cs](
dt, A, dt_bias, dt_out, dA_cumsum,
batch, seqlen, nheads, chunk_size,
dt_limit[0], dt_limit[1],
dt.stride(0), dt.stride(1), dt.stride(2),
A.stride(0),
dt_bias.stride(0) if dt_bias is not None else 0,
dt_out.stride(0), dt_out.stride(2), dt_out.stride(1), dt_out.stride(3),
dA_cumsum.stride(0), dA_cumsum.stride(2), dA_cumsum.stride(1), dA_cumsum.stride(3),
dt_softplus,
HAS_DT_BIAS=dt_bias is not None,
BLOCK_SIZE_CHUNK=triton.next_power_of_2(chunk_size),
)
return dA_cumsum, dt_out
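
# A rough, unoptimized PyTorch sketch of what _chunk_cumsum_fwd computes, added here purely as a
# hedged sanity check (it is not part of the original module). It assumes the standard Mamba2 dt
# preprocessing (optional bias -> optional softplus -> clamp to dt_limit) and a per-chunk cumulative
# sum of A * dt; padded positions past seqlen are simply zero. The Triton kernel above is the
# authoritative implementation and may differ in edge cases and dtypes.
def _chunk_cumsum_fwd_ref(dt, A, chunk_size, dt_bias=None, dt_softplus=False, dt_limit=(0.0, float("inf"))):
    batch, seqlen, nheads = dt.shape
    nchunks = math.ceil(seqlen / chunk_size)
    dt = dt.float()
    if dt_bias is not None:
        dt = dt + dt_bias.float()
    if dt_softplus:
        dt = F.softplus(dt)
    dt = dt.clamp(min=dt_limit[0], max=dt_limit[1])
    # Pad the sequence dimension to a whole number of chunks, then split it into chunks
    if seqlen < nchunks * chunk_size:
        dt = F.pad(dt, (0, 0, 0, nchunks * chunk_size - seqlen))
    dt = rearrange(dt, "b (c l) h -> b h c l", l=chunk_size)
    dA_cumsum = torch.cumsum(dt * A.float()[None, :, None, None], dim=-1)
    return dA_cumsum, dt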
def _chunk_cumsum_bwd(ddA, ddt_out, dt, A, dt_bias=None, dt_softplus=False, dt_limit=(0.0, float("inf")), ddt=None):
batch, seqlen, nheads = dt.shape
_, _, nchunks, chunk_size = ddA.shape
assert ddA.shape == (batch, nheads, nchunks, chunk_size)
assert ddt_out.shape == (batch, nheads, nchunks, chunk_size)
assert A.shape == (nheads,)
if dt_bias is not None:
assert dt_bias.shape == (nheads,)
ddt_bias = torch.empty_like(dt_bias, dtype=torch.float32)
else:
ddt_bias = None
if ddt is not None:
assert ddt.shape == dt.shape
else:
ddt = torch.empty_like(dt)
dA = torch.empty_like(A, dtype=torch.float32)
grid_chunk_cs = lambda META: (batch, nchunks, triton.cdiv(nheads, META['BLOCK_SIZE_H']))
with torch.cuda.device(dt.device.index):
_chunk_cumsum_bwd_kernel[grid_chunk_cs](
ddA, ddt_out, dt, A, dt_bias, ddt, dA, ddt_bias,
batch, seqlen, nheads, chunk_size,
dt_limit[0], dt_limit[1],
ddA.stride(0), ddA.stride(2), ddA.stride(1), ddA.stride(3),
ddt_out.stride(0), ddt_out.stride(2), ddt_out.stride(1), ddt_out.stride(3),
dt.stride(0), dt.stride(1), dt.stride(2),
A.stride(0),
dt_bias.stride(0) if dt_bias is not None else 0,
ddt.stride(0), ddt.stride(1), ddt.stride(2),
dA.stride(0),
ddt_bias.stride(0) if ddt_bias is not None else 0,
dt_softplus,
HAS_DT_BIAS=dt_bias is not None,
BLOCK_SIZE_CHUNK=triton.next_power_of_2(chunk_size),
)
return ddt, dA, ddt_bias
def _chunk_state_fwd(B, x, dt, dA_cumsum, seq_idx=None, states=None, states_in_fp32=True):
batch, seqlen, nheads, headdim = x.shape
_, _, nchunks, chunk_size = dt.shape
_, _, ngroups, dstate = B.shape
assert nheads % ngroups == 0
assert B.shape == (batch, seqlen, ngroups, dstate)
assert dt.shape == (batch, nheads, nchunks, chunk_size)
assert dA_cumsum.shape == dt.shape
if seq_idx is not None:
assert seq_idx.shape == (batch, seqlen)
if states is not None:
assert states.shape == (batch, nchunks, nheads, headdim, dstate)
else:
states_dtype = torch.float32 if states_in_fp32 else B.dtype
states = torch.empty((batch, nchunks, nheads, headdim, dstate), device=x.device, dtype=states_dtype)
grid = lambda META: (triton.cdiv(headdim, META['BLOCK_SIZE_M']) * triton.cdiv(dstate, META['BLOCK_SIZE_N']),
batch * nchunks, nheads)
with torch.cuda.device(x.device.index):
_chunk_state_fwd_kernel[grid](
x, B, states, dt, dA_cumsum, seq_idx,
headdim, dstate, chunk_size,
batch, seqlen, nheads // ngroups,
x.stride(0), x.stride(1), x.stride(2), x.stride(3),
B.stride(0), B.stride(1), B.stride(2), B.stride(-1),
states.stride(0), states.stride(1), states.stride(2), states.stride(3), states.stride(4),
dt.stride(0), dt.stride(2), dt.stride(1), dt.stride(3),
dA_cumsum.stride(0), dA_cumsum.stride(2), dA_cumsum.stride(1), dA_cumsum.stride(3),
*((seq_idx.stride(0), seq_idx.stride(1)) if seq_idx is not None else (0, 0)),
HAS_SEQ_IDX=seq_idx is not None,
)
return states
def _chunk_state_bwd_dx(B, x, dt, dA_cumsum, dstates, dx=None):
batch, seqlen, nheads, headdim = x.shape
_, _, nchunks, chunk_size = dt.shape
_, _, ngroups, dstate = B.shape
assert nheads % ngroups == 0
assert B.shape == (batch, seqlen, ngroups, dstate)
assert dt.shape == (batch, nheads, nchunks, chunk_size)
assert dA_cumsum.shape == dt.shape
assert dstates.shape == (batch, nchunks, nheads, headdim, dstate)
if dx is not None:
assert dx.shape == x.shape
else:
dx = torch.empty_like(x)
ddt = torch.empty(batch, nheads, nchunks, chunk_size, device=dt.device, dtype=torch.float32)
ddA_cumsum = torch.empty(batch, nheads, nchunks, chunk_size, device=dA_cumsum.device, dtype=torch.float32)
grid_dx = lambda META: (triton.cdiv(chunk_size, META['BLOCK_SIZE_M']) * triton.cdiv(headdim, META['BLOCK_SIZE_N']),
batch * nchunks, nheads)
with torch.cuda.device(x.device.index):
_chunk_state_bwd_dx_kernel[grid_dx](
x, B, dstates, dt, dA_cumsum, dx, ddt, ddA_cumsum,
chunk_size, headdim, dstate,
batch, seqlen, nheads // ngroups,
x.stride(0), x.stride(1), x.stride(2), x.stride(3),
B.stride(0), B.stride(1), B.stride(2), B.stride(-1),
dstates.stride(0), dstates.stride(1), dstates.stride(2), dstates.stride(3), dstates.stride(4),
dt.stride(0), dt.stride(2), dt.stride(1), dt.stride(3),
dA_cumsum.stride(0), dA_cumsum.stride(2), dA_cumsum.stride(1), dA_cumsum.stride(3),
dx.stride(0), dx.stride(1), dx.stride(2), dx.stride(3),
ddt.stride(0), ddt.stride(2), ddt.stride(1), ddt.stride(3),
ddA_cumsum.stride(0), ddA_cumsum.stride(2), ddA_cumsum.stride(1), ddA_cumsum.stride(3),
BLOCK_SIZE_DSTATE=max(triton.next_power_of_2(dstate), 16),
)
return dx, ddt.to(dt.dtype), ddA_cumsum.to(dA_cumsum.dtype)
def _chunk_state_bwd_db(x, dt, dA_cumsum, dstates, seq_idx=None, B=None, ngroups=1):
batch, seqlen, nheads, headdim = x.shape
_, _, nchunks, chunk_size = dt.shape
dstate = dstates.shape[-1]
assert dt.shape == (batch, nheads, nchunks, chunk_size)
assert dA_cumsum.shape == dt.shape
assert dstates.shape == (batch, nchunks, nheads, headdim, dstate)
if seq_idx is not None:
assert seq_idx.shape == (batch, seqlen)
if B is not None:
assert B.shape == (batch, seqlen, ngroups, dstate)
B_strides = (B.stride(0), B.stride(1), B.stride(2), B.stride(3))
# Use torch.empty since the Triton kernel will call init_to_zero
ddA_cumsum = torch.empty(batch, nheads, nchunks, chunk_size, device=x.device, dtype=torch.float32)
ddA_cumsum_strides = (ddA_cumsum.stride(0), ddA_cumsum.stride(2), ddA_cumsum.stride(1), ddA_cumsum.stride(3))
else:
B_strides = (0, 0, 0, 0)
ddA_cumsum = None
ddA_cumsum_strides = (0, 0, 0, 0)
nheads_ngroups_ratio = nheads // ngroups
sm_count = torch.cuda.get_device_properties(x.device).multi_processor_count
nheads_per_program = max(min(math.ceil(batch * nchunks * nheads / sm_count), nheads_ngroups_ratio), 1)
nsplits = triton.cdiv(nheads_ngroups_ratio, nheads_per_program)
dB = torch.empty(batch, seqlen, nsplits, ngroups, dstate, device=x.device, dtype=torch.float32)
grid_db = lambda META: (triton.cdiv(chunk_size, META['BLOCK_SIZE_M']) * triton.cdiv(dstate, META['BLOCK_SIZE_N']),
batch * nchunks, nsplits * ngroups)
with torch.cuda.device(x.device.index):
_chunk_state_bwd_db_kernel[grid_db](
x, dstates, B, dt, dA_cumsum, seq_idx, dB, ddA_cumsum,
chunk_size, dstate, headdim,
batch, seqlen, nheads, nheads_per_program, ngroups,
x.stride(0), x.stride(1), x.stride(2), x.stride(3),
dstates.stride(0), dstates.stride(1), dstates.stride(2), dstates.stride(3), dstates.stride(4),
*B_strides,
dt.stride(0), dt.stride(2), dt.stride(1), dt.stride(3),
dA_cumsum.stride(0), dA_cumsum.stride(2), dA_cumsum.stride(1), dA_cumsum.stride(3),
*((seq_idx.stride(0), seq_idx.stride(1)) if seq_idx is not None else (0, 0)),
dB.stride(0), dB.stride(1), dB.stride(2), dB.stride(3), dB.stride(4),
*ddA_cumsum_strides,
HAS_DDA_CS=ddA_cumsum is not None,
HAS_SEQ_IDX=seq_idx is not None,
BLOCK_SIZE_K=max(triton.next_power_of_2(headdim), 16),
)
dB = dB.sum(2)
if ddA_cumsum is not None:
# The first element of ddA_cumsum is always zero, since that dA_cumsum does not contribute
# to the state of the chunk.
# torch.cumsum(ddA_cumsum[..., 1:], dim=-1, out=ddA_cumsum[..., 1:])
# But it's easier to just do the cumsum for all elements, the result will be the same.
torch.cumsum(ddA_cumsum, dim=-1, out=ddA_cumsum)
return dB if B is None else (dB, ddA_cumsum)
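
# Tiny illustration of the "cumsum of all elements" note in _chunk_state_bwd_db above (not part of
# the original module): when the first element along the last dim is zero, a cumsum over the full
# dim and a cumsum over [..., 1:] written back in place produce the same tensor.
def _cumsum_first_zero_example():
    t = torch.tensor([0.0, 1.0, 2.0, 3.0])
    full = torch.cumsum(t, dim=-1)                      # [0, 1, 3, 6]
    partial = t.clone()
    torch.cumsum(partial[1:], dim=-1, out=partial[1:])  # leaves partial[0] == 0 untouched
    assert torch.equal(full, partial)
    return full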
def _chunk_state_bwd_ddAcs_stable(B, x, dt, dA_cumsum, dstates, seq_idx=None):
batch, seqlen, nheads, headdim = x.shape
_, _, nchunks, chunk_size = dt.shape
_, _, ngroups, dstate = B.shape
assert nheads % ngroups == 0
assert B.shape == (batch, seqlen, ngroups, dstate)
assert dt.shape == (batch, nheads, nchunks, chunk_size)
assert dA_cumsum.shape == dt.shape
assert dstates.shape == (batch, nchunks, nheads, headdim, dstate)
if seq_idx is not None:
assert seq_idx.shape == (batch, seqlen)
# Use torch.empty since the Triton kernel will call init_to_zero
ddA_cumsum = torch.empty(batch, nheads, nchunks, chunk_size, device=x.device, dtype=torch.float32)
grid_ddtcs = lambda META: (triton.cdiv(chunk_size, META['BLOCK_SIZE_M']) * triton.cdiv(headdim, META['BLOCK_SIZE_N']),
batch * nchunks, nheads)
with torch.cuda.device(x.device.index):
_chunk_state_bwd_ddAcs_stable_kernel[grid_ddtcs](
x, B, dstates, dt, dA_cumsum, seq_idx, ddA_cumsum,
chunk_size, headdim, dstate,
batch, seqlen, nheads // ngroups,
x.stride(0), x.stride(1), x.stride(2), x.stride(3),
B.stride(0), B.stride(1), B.stride(2), B.stride(-1),
dstates.stride(0), dstates.stride(1), dstates.stride(2), dstates.stride(3), dstates.stride(4),
dt.stride(0), dt.stride(2), dt.stride(1), dt.stride(3),
dA_cumsum.stride(0), dA_cumsum.stride(2), dA_cumsum.stride(1), dA_cumsum.stride(3),
*((seq_idx.stride(0), seq_idx.stride(1)) if seq_idx is not None else (0, 0)),
ddA_cumsum.stride(0), ddA_cumsum.stride(2), ddA_cumsum.stride(1), ddA_cumsum.stride(3),
HAS_SEQ_IDX=seq_idx is not None,
BLOCK_SIZE_M=max(triton.next_power_of_2(chunk_size), 16),
BLOCK_SIZE_DSTATE=max(triton.next_power_of_2(dstate), 16),
)
torch.cumsum(ddA_cumsum[..., 1:], dim=-1, out=ddA_cumsum[..., 1:])
return ddA_cumsum
def chunk_state_varlen(B, x, dt, dA_cumsum, cu_seqlens, chunk_states):
total_seqlen, nheads, headdim = x.shape
_, nchunks, chunk_size = dt.shape
_, ngroups, dstate = B.shape
batch = cu_seqlens.shape[0] - 1
cu_seqlens = cu_seqlens.contiguous()
assert nheads % ngroups == 0
assert B.shape == (total_seqlen, ngroups, dstate)
assert dt.shape == (nheads, nchunks, chunk_size)
assert dA_cumsum.shape == dt.shape
assert chunk_states.shape == (nchunks, nheads, headdim, dstate)
states = torch.empty(batch, nheads, headdim, dstate, dtype=chunk_states.dtype, device=chunk_states.device)
grid = lambda META: (triton.cdiv(headdim, META['BLOCK_SIZE_M']) * triton.cdiv(dstate, META['BLOCK_SIZE_N']),
batch, nheads)
with torch.cuda.device(x.device.index):
_chunk_state_varlen_kernel[grid](
x, B, dt, dA_cumsum, chunk_states, cu_seqlens, states,
headdim, dstate, chunk_size,
total_seqlen, nheads // ngroups,
x.stride(0), x.stride(1), x.stride(2),
B.stride(0), B.stride(1), B.stride(2),
dt.stride(1), dt.stride(0), dt.stride(2),
dA_cumsum.stride(1), dA_cumsum.stride(0), dA_cumsum.stride(2),
chunk_states.stride(0), chunk_states.stride(1), chunk_states.stride(2), chunk_states.stride(3),
states.stride(0), states.stride(1), states.stride(2), states.stride(3),
)
return states
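
# Hypothetical shape-level smoke test for chunk_state_varlen (not part of the original module).
# It only shows how the packed (varlen) inputs line up, using random data on a CUDA device, so the
# numerical values are meaningless; the names and sizes below are illustrative assumptions.
def _chunk_state_varlen_shape_example(device="cuda", dtype=torch.float16):
    nheads, ngroups, headdim, dstate, chunk_size = 8, 2, 64, 16, 128
    cu_seqlens = torch.tensor([0, 100, 300], device=device, dtype=torch.int32)  # two packed sequences
    total_seqlen = int(cu_seqlens[-1])
    nchunks = math.ceil(total_seqlen / chunk_size)
    x = torch.randn(total_seqlen, nheads, headdim, device=device, dtype=dtype)
    B = torch.randn(total_seqlen, ngroups, dstate, device=device, dtype=dtype)
    dt = torch.rand(nheads, nchunks, chunk_size, device=device, dtype=torch.float32)
    dA_cumsum = torch.cumsum(-dt, dim=-1)
    chunk_states = torch.randn(nchunks, nheads, headdim, dstate, device=device, dtype=torch.float32)
    states = chunk_state_varlen(B, x, dt, dA_cumsum, cu_seqlens, chunk_states)
    assert states.shape == (cu_seqlens.shape[0] - 1, nheads, headdim, dstate)
    return states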
class ChunkStateFn(torch.autograd.Function):
@staticmethod
def forward(ctx, B, x, dt, dA_cumsum, states_in_fp32=True):
batch, seqlen, nheads, headdim = x.shape
_, _, nchunks, chunk_size = dt.shape
assert seqlen <= nchunks * chunk_size
_, _, ngroups, dstate = B.shape
assert B.shape == (batch, seqlen, ngroups, dstate)
assert dt.shape == (batch, nheads, nchunks, chunk_size)
assert dA_cumsum.shape == (batch, nheads, nchunks, chunk_size)
if B.stride(-1) != 1:
B = B.contiguous()
if x.stride(-1) != 1 and x.stride(1) != 1: # Either M or K dimension should be contiguous
x = x.contiguous()
states = _chunk_state_fwd(B, x, dt, dA_cumsum, states_in_fp32=states_in_fp32)
ctx.save_for_backward(B, x, dt, dA_cumsum)
return states
@staticmethod
def backward(ctx, dstates):
B, x, dt, dA_cumsum = ctx.saved_tensors
batch, seqlen, nheads, headdim = x.shape
_, _, nchunks, chunk_size = dt.shape
_, _, ngroups, dstate = B.shape
assert dstates.shape == (batch, nchunks, nheads, headdim, dstate)
if dstates.stride(-1) != 1:
dstates = dstates.contiguous()
dx, ddt, ddA_cumsum = _chunk_state_bwd_dx(B, x, dt, dA_cumsum, dstates)
dB = _chunk_state_bwd_db(x, dt, dA_cumsum, dstates, ngroups=ngroups)
dB = dB.to(B.dtype)
return dB, dx, ddt, ddA_cumsum, None
def chunk_state(B, x, dt, dA_cumsum, states_in_fp32=True):
"""
Argument:
        B: (batch, seqlen, ngroups, dstate)
x: (batch, seqlen, nheads, headdim)
dt: (batch, nheads, nchunks, chunk_size)
dA_cumsum: (batch, nheads, nchunks, chunk_size)
Return:
states: (batch, nchunks, nheads, headdim, dstate)
"""
return ChunkStateFn.apply(B, x, dt, dA_cumsum, states_in_fp32)
def chunk_state_ref(B, x, dt, dA_cumsum):
"""
Argument:
        B: (batch, seqlen, ngroups, dstate)
x: (batch, seqlen, nheads, headdim)
dt: (batch, nheads, nchunks, chunk_size)
dA_cumsum: (batch, nheads, nchunks, chunk_size)
Return:
states: (batch, nchunks, nheads, headdim, dstate)
"""
# Check constraints.
batch, seqlen, nheads, headdim = x.shape
dstate = B.shape[-1]
_, _, nchunks, chunk_size = dt.shape
assert seqlen <= nchunks * chunk_size
assert x.shape == (batch, seqlen, nheads, headdim)
assert dt.shape == (batch, nheads, nchunks, chunk_size)
ngroups = B.shape[2]
assert nheads % ngroups == 0
assert B.shape == (batch, seqlen, ngroups, dstate)
B = repeat(B, "b l g d -> b l (g h) d", h=nheads // ngroups)
assert dA_cumsum.shape == (batch, nheads, nchunks, chunk_size)
if seqlen < nchunks * chunk_size:
x = F.pad(x, (0, 0, 0, 0, 0, nchunks * chunk_size - seqlen))
B = F.pad(B, (0, 0, 0, 0, 0, nchunks * chunk_size - seqlen))
x = rearrange(x, "b (c l) h p -> b c l h p", l=chunk_size)
B = rearrange(B, "b (c l) ... -> b c l ...", l=chunk_size)
decay_states = torch.exp((dA_cumsum[:, :, :, -1:] - dA_cumsum))
return torch.einsum("bclhn,bhcl,bhcl,bclhp->bchpn", B.to(x.dtype), decay_states.to(x.dtype), dt.to(x.dtype), x)
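
# Hypothetical consistency check between the Triton path (chunk_state) and the einsum reference
# (chunk_state_ref) above; not part of the original module. It uses random fp32 inputs on a CUDA
# device and loose tolerances, since the Triton kernel may accumulate with TF32 / in a different order.
def _chunk_state_consistency_example(device="cuda"):
    batch, seqlen, nheads, headdim = 2, 512, 8, 64
    ngroups, dstate, chunk_size = 2, 16, 128
    nchunks = seqlen // chunk_size
    x = torch.randn(batch, seqlen, nheads, headdim, device=device, dtype=torch.float32)
    B = torch.randn(batch, seqlen, ngroups, dstate, device=device, dtype=torch.float32)
    dt = torch.rand(batch, nheads, nchunks, chunk_size, device=device, dtype=torch.float32)
    dA_cumsum = torch.cumsum(-dt, dim=-1)
    out = chunk_state(B, x, dt, dA_cumsum, states_in_fp32=True)   # (batch, nchunks, nheads, headdim, dstate)
    ref = chunk_state_ref(B, x, dt, dA_cumsum)
    assert torch.allclose(out, ref.to(out.dtype), rtol=1e-2, atol=1e-2)
    return out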
# Copyright (c) 2024, Tri Dao, Albert Gu.
"""We want triton==2.1.0 or 2.2.0 for this
"""
from typing import Optional
import math
from packaging import version
import torch
import torch.nn.functional as F
from torch import Tensor
from torch.cuda.amp import custom_bwd, custom_fwd
import triton
import triton.language as tl
from einops import rearrange, repeat
try:
from causal_conv1d import causal_conv1d_fn
import causal_conv1d_cuda
except ImportError:
causal_conv1d_fn, causal_conv1d_cuda = None, None
from mamba_ssm.ops.triton.ssd_bmm import _bmm_chunk_fwd, _bmm_chunk_bwd
from mamba_ssm.ops.triton.ssd_chunk_state import _chunk_cumsum_fwd, _chunk_cumsum_bwd
from mamba_ssm.ops.triton.ssd_chunk_state import _chunk_state_fwd, _chunk_state_bwd_db
from mamba_ssm.ops.triton.ssd_chunk_state import _chunk_state_bwd_ddAcs_stable
from mamba_ssm.ops.triton.ssd_chunk_state import chunk_state, chunk_state_ref
from mamba_ssm.ops.triton.ssd_chunk_state import chunk_state_varlen
from mamba_ssm.ops.triton.ssd_state_passing import _state_passing_fwd, _state_passing_bwd
from mamba_ssm.ops.triton.ssd_state_passing import state_passing, state_passing_ref
from mamba_ssm.ops.triton.ssd_chunk_scan import _chunk_scan_fwd, _chunk_scan_bwd_dz, _chunk_scan_bwd_dstates
from mamba_ssm.ops.triton.ssd_chunk_scan import _chunk_scan_bwd_dC, _chunk_scan_bwd_dcb
from mamba_ssm.ops.triton.ssd_chunk_scan import _chunk_scan_bwd_ddAcs_stable
from mamba_ssm.ops.triton.ssd_chunk_scan import chunk_scan, chunk_scan_ref
from mamba_ssm.ops.triton.ssd_chunk_scan import _chunk_scan_bwd_ddAcs_prev
from mamba_ssm.ops.triton.layernorm_gated import rmsnorm_fn, _layer_norm_fwd, _layer_norm_bwd
from mamba_ssm.ops.triton.k_activations import _swiglu_fwd, _swiglu_bwd
TRITON_22 = version.parse(triton.__version__) >= version.parse('2.2.0')
def init_to_zero(names):
return lambda nargs: [nargs[name].zero_() for name in names if nargs[name] is not None]
@triton.autotune(
configs=[
triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64}, num_stages=3, num_warps=8, pre_hook=init_to_zero(["ddt_ptr"])),
triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4, pre_hook=init_to_zero(["ddt_ptr"])),
triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4, pre_hook=init_to_zero(["ddt_ptr"])),
triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4, pre_hook=init_to_zero(["ddt_ptr"])),
triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4, pre_hook=init_to_zero(["ddt_ptr"])),
triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4, pre_hook=init_to_zero(["ddt_ptr"])),
triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32}, num_stages=5, num_warps=4, pre_hook=init_to_zero(["ddt_ptr"])),
triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32}, num_stages=5, num_warps=4, pre_hook=init_to_zero(["ddt_ptr"])),
triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4, pre_hook=init_to_zero(["ddt_ptr"])),
],
key=['chunk_size', 'hdim', 'dstate'],
)
@triton.jit
def _chunk_scan_chunk_state_bwd_dx_kernel(
# Pointers to matrices
x_ptr, cb_ptr, dout_ptr, dt_ptr, dA_cumsum_ptr, seq_idx_ptr, D_ptr,
b_ptr, dstates_ptr,
dx_ptr, ddt_ptr, dD_ptr,
# Matrix dimensions
chunk_size, hdim, dstate,
batch, seqlen, nheads_ngroups_ratio,
# Strides
stride_x_batch, stride_x_seqlen, stride_x_head, stride_x_hdim,
stride_cb_batch, stride_cb_chunk, stride_cb_head, stride_cb_csize_m, stride_cb_csize_k,
stride_dout_batch, stride_dout_seqlen, stride_dout_head, stride_dout_hdim,
stride_dt_batch, stride_dt_chunk, stride_dt_head, stride_dt_csize,
stride_dA_cs_batch, stride_dA_cs_chunk, stride_dA_cs_head, stride_dA_cs_csize,
stride_seq_idx_batch, stride_seq_idx_seqlen,
stride_D_head,
stride_b_batch, stride_b_seqlen, stride_b_head, stride_b_dstate,
stride_dstates_batch, stride_dstates_chunk, stride_dstates_head, stride_dstates_hdim, stride_dstates_dstate,
stride_dx_batch, stride_dx_seqlen, stride_dx_head, stride_dx_hdim,
stride_ddt_batch, stride_ddt_chunk, stride_ddt_head, stride_ddt_csize,
stride_dD_batch, stride_dD_chunk, stride_dD_head, stride_dD_csize, stride_dD_hdim,
# Meta-parameters
HAS_D: tl.constexpr,
D_HAS_HDIM: tl.constexpr,
HAS_SEQ_IDX: tl.constexpr,
BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
BLOCK_SIZE_DSTATE: tl.constexpr,
IS_TRITON_22: tl.constexpr,
):
pid_bc = tl.program_id(axis=1)
pid_c = pid_bc // batch
pid_b = pid_bc - pid_c * batch
pid_h = tl.program_id(axis=2)
num_pid_n = tl.cdiv(hdim, BLOCK_SIZE_N)
pid_m = tl.program_id(axis=0) // num_pid_n
pid_n = tl.program_id(axis=0) % num_pid_n
x_ptr += pid_b * stride_x_batch + pid_c * chunk_size * stride_x_seqlen + pid_h * stride_x_head
cb_ptr += pid_b * stride_cb_batch + pid_c * stride_cb_chunk + (pid_h // nheads_ngroups_ratio) * stride_cb_head
dout_ptr += pid_b * stride_dout_batch + pid_c * chunk_size * stride_dout_seqlen + pid_h * stride_dout_head
dt_ptr += pid_b * stride_dt_batch + pid_c * stride_dt_chunk + pid_h * stride_dt_head
ddt_ptr += pid_b * stride_ddt_batch + pid_c * stride_ddt_chunk + pid_h * stride_ddt_head
dA_cumsum_ptr += pid_b * stride_dA_cs_batch + pid_c * stride_dA_cs_chunk + pid_h * stride_dA_cs_head
b_ptr += pid_b * stride_b_batch + pid_c * chunk_size * stride_b_seqlen + (pid_h // nheads_ngroups_ratio) * stride_b_head
dstates_ptr += pid_b * stride_dstates_batch + pid_c * stride_dstates_chunk + pid_h * stride_dstates_head
if HAS_SEQ_IDX:
seq_idx_ptr += pid_b * stride_seq_idx_batch + pid_c * chunk_size * stride_seq_idx_seqlen
offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
chunk_size_limit = min(chunk_size, seqlen - pid_c * chunk_size)
acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
dA_cs_m = tl.load(dA_cumsum_ptr + offs_m * stride_dA_cs_csize, mask=offs_m < chunk_size_limit, other=0.0).to(tl.float32)
dA_cs_last = tl.load(dA_cumsum_ptr + (chunk_size - 1) * stride_dA_cs_csize).to(tl.float32)
if not HAS_SEQ_IDX:
scale = tl.exp(dA_cs_last - dA_cs_m)
else:
seq_idx_m = tl.load(seq_idx_ptr + offs_m * stride_seq_idx_seqlen, mask=offs_m < chunk_size_limit, other=-1)
seq_idx_last = tl.load(seq_idx_ptr + (chunk_size_limit - 1) * stride_seq_idx_seqlen)
scale = tl.where(seq_idx_m == seq_idx_last, tl.exp(dA_cs_last - dA_cs_m), 0.0)
# Might be faster to just do 1 iteration with larger BLOCK_SIZE_K, up to block size 128
    # However, we get an error from the Triton compiler 2.1.0 for that code path:
# Unexpected mma -> mma layout conversion
# Triton 2.2.0 fixes this
offs_dstate = tl.arange(0, BLOCK_SIZE_DSTATE if IS_TRITON_22 and BLOCK_SIZE_DSTATE <= 128 else BLOCK_SIZE_K)
b_ptrs = b_ptr + (offs_m[:, None] * stride_b_seqlen + offs_dstate[None, :] * stride_b_dstate)
dstates_ptrs = dstates_ptr + (offs_n[None, :] * stride_dstates_hdim + offs_dstate[:, None] * stride_dstates_dstate)
if IS_TRITON_22 and BLOCK_SIZE_DSTATE <= 128:
b = tl.load(b_ptrs, mask=(offs_m[:, None] < chunk_size_limit) & (offs_dstate[None, :] < dstate), other=0.0)
dstates = tl.load(dstates_ptrs, mask=(offs_dstate[:, None] < dstate) & (offs_n[None, :] < hdim), other=0.0)
dstates = dstates.to(b_ptr.dtype.element_ty)
acc = tl.dot(b, dstates) * scale[:, None]
else:
for k in range(0, dstate, BLOCK_SIZE_K):
b = tl.load(b_ptrs, mask=(offs_m[:, None] < chunk_size_limit) & (offs_dstate[None, :] < dstate - k), other=0.0)
dstates = tl.load(dstates_ptrs, mask=(offs_dstate[:, None] < dstate - k) & (offs_n[None, :] < hdim), other=0.0)
dstates = dstates.to(b_ptr.dtype.element_ty)
acc += tl.dot(b, dstates)
b_ptrs += BLOCK_SIZE_K * stride_b_dstate
dstates_ptrs += BLOCK_SIZE_K * stride_dstates_dstate
acc *= scale[:, None]
# x_ptrs = x_ptr + (offs_m[:, None] * stride_x_seqlen + offs_n[None, :] * stride_x_hdim)
# x = tl.load(x_ptrs, mask=(offs_m[:, None] < chunk_size_limit) & (offs_n[None, :] < hdim), other=0.0).to(tl.float32)
# dt_ptrs = dt_ptr + offs_m * stride_dt_csize
# dt_m = tl.load(dt_ptrs, mask=offs_m < chunk_size_limit, other=0.0).to(tl.float32)
# ddt = tl.sum(acc * x, axis=1) * dt_m
# ddt_ptrs = ddt_ptr + offs_m * stride_ddt_csize
# tl.atomic_add(ddt_ptrs, ddt, mask=offs_m < chunk_size)
offs_k = tl.arange(0, BLOCK_SIZE_K)
cb_ptrs = cb_ptr + (offs_m[:, None] * stride_cb_csize_m + offs_k[None, :] * stride_cb_csize_k)
dout_ptrs = dout_ptr + (offs_k[:, None] * stride_dout_seqlen + offs_n[None, :] * stride_dout_hdim)
dA_cumsum_ptrs = dA_cumsum_ptr + offs_k * stride_dA_cs_csize
K_MAX = chunk_size_limit
K_MIN = pid_m * BLOCK_SIZE_M
cb_ptrs += K_MIN * stride_cb_csize_k
dout_ptrs += K_MIN * stride_dout_seqlen
dA_cumsum_ptrs += K_MIN * stride_dA_cs_csize
for k in range(K_MIN, K_MAX, BLOCK_SIZE_K):
k = tl.multiple_of(k, BLOCK_SIZE_K)
# For some reason setting mask to (offs_m[:, None] < chunk_size_limit) is much slower
cb = tl.load(cb_ptrs, mask=(offs_m[:, None] < chunk_size) & (offs_k[None, :] < K_MAX - k), other=0.0)
dout = tl.load(dout_ptrs, mask=(offs_k[:, None] < K_MAX - k) & (offs_n[None, :] < hdim), other=0.0)
dA_cs_k = tl.load(dA_cumsum_ptrs, mask=offs_k < K_MAX - k, other=0.0).to(tl.float32)
cb *= tl.exp(dA_cs_k[None, :] - dA_cs_m[:, None])
# If we don't have the (k + offs_k[None, :] < K_MAX) mask, for indices outside this range,
# we might have dA_cs_m = 0.0 and dA_cs_k very negative, and tl.exp will return inf.
# Multiplying with cb, which is 0.0 outside the range, will make the result NaN.
# This will cause NaN in acc, and hence NaN in dx and ddt.
mask = (k + offs_k[None, :] >= offs_m[:, None]) & (k + offs_k[None, :] < K_MAX)
cb = tl.where(mask, cb, 0.0)
cb = cb.to(dout_ptr.dtype.element_ty)
acc += tl.dot(cb, dout)
cb_ptrs += BLOCK_SIZE_K * stride_cb_csize_k
dout_ptrs += BLOCK_SIZE_K * stride_dout_seqlen
dA_cumsum_ptrs += BLOCK_SIZE_K * stride_dA_cs_csize
offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
dt_ptrs = dt_ptr + offs_m * stride_dt_csize
dt_m = tl.load(dt_ptrs, mask=offs_m < chunk_size_limit, other=0.0).to(tl.float32)
dx = acc * dt_m[:, None]
dx_ptr += pid_b * stride_dx_batch + pid_c * chunk_size * stride_dx_seqlen + pid_h * stride_dx_head
dx_ptrs = dx_ptr + (offs_m[:, None] * stride_dx_seqlen + offs_n[None, :] * stride_dx_hdim)
if HAS_D:
dout_res_ptrs = dout_ptr + (offs_m[:, None] * stride_dout_seqlen + offs_n[None, :] * stride_dout_hdim)
dout_res = tl.load(dout_res_ptrs, mask=(offs_m[:, None] < chunk_size_limit) & (offs_n[None, :] < hdim), other=0.0).to(tl.float32)
if D_HAS_HDIM:
D = tl.load(D_ptr + pid_h * stride_D_head + offs_n, mask=offs_n < hdim, other=0.0).to(tl.float32)
else:
D = tl.load(D_ptr + pid_h * stride_D_head).to(tl.float32)
dx += dout_res * D
tl.store(dx_ptrs, dx, mask=(offs_m[:, None] < chunk_size_limit) & (offs_n[None, :] < hdim))
x_ptrs = x_ptr + (offs_m[:, None] * stride_x_seqlen + offs_n[None, :] * stride_x_hdim)
x = tl.load(x_ptrs, mask=(offs_m[:, None] < chunk_size_limit) & (offs_n[None, :] < hdim), other=0.0).to(tl.float32)
if HAS_D:
dD_ptr += pid_b * stride_dD_batch + pid_c * stride_dD_chunk + pid_h * stride_dD_head + pid_m * stride_dD_csize
if D_HAS_HDIM:
dD_ptrs = dD_ptr + offs_n * stride_dD_hdim
dD = tl.sum(dout_res * x, axis=0)
tl.store(dD_ptrs, dD, mask=offs_n < hdim)
else:
dD = tl.sum(dout_res * x)
tl.store(dD_ptr, dD)
ddt = tl.sum(acc * x, axis=1)
ddt_ptrs = ddt_ptr + offs_m * stride_ddt_csize
tl.atomic_add(ddt_ptrs, ddt, mask=offs_m < chunk_size)
def _chunk_scan_chunk_state_bwd_dx(x, dt, dA_cumsum, B, CB, dout, dstates, D=None, seq_idx=None, dx=None):
batch, seqlen, nheads, headdim = x.shape
_, _, nchunks, chunk_size = dt.shape
_, _, ngroups, dstate = B.shape
assert nheads % ngroups == 0
assert B.shape == (batch, seqlen, ngroups, dstate)
assert CB.shape == (batch, nchunks, ngroups, chunk_size, chunk_size)
assert dt.shape == (batch, nheads, nchunks, chunk_size)
assert dA_cumsum.shape == dt.shape
assert dout.shape == x.shape
assert dstates.shape == (batch, nchunks, nheads, headdim, dstate)
if seq_idx is not None:
assert seq_idx.shape == (batch, seqlen)
if D is not None:
assert D.shape == (nheads, headdim) or D.shape == (nheads,)
assert D.stride(-1) == 1
BLOCK_SIZE_min = 32
dD = torch.empty(triton.cdiv(chunk_size, BLOCK_SIZE_min), batch, nchunks, nheads,
headdim if D.dim() == 2 else 1, device=D.device, dtype=torch.float32)
else:
dD = None
dD_strides = ((dD.stride(0), dD.stride(1), dD.stride(2), dD.stride(3), dD.stride(4))
if D is not None else (0, 0, 0, 0, 0))
if dx is None:
dx = torch.empty_like(x)
else:
assert dx.shape == x.shape
ddt = torch.empty(batch, nheads, nchunks, chunk_size, device=dout.device, dtype=torch.float32)
grid_dx = lambda META: (triton.cdiv(chunk_size, META['BLOCK_SIZE_M']) * triton.cdiv(headdim, META['BLOCK_SIZE_N']),
batch * nchunks, nheads)
with torch.cuda.device(x.device.index):
_chunk_scan_chunk_state_bwd_dx_kernel[grid_dx](
x, CB, dout, dt, dA_cumsum, seq_idx, D, B, dstates, dx, ddt, dD,
chunk_size, headdim, dstate,
batch, seqlen, nheads // ngroups,
x.stride(0), x.stride(1), x.stride(2), x.stride(3),
CB.stride(0), CB.stride(1), CB.stride(2), CB.stride(-1), CB.stride(-2),
dout.stride(0), dout.stride(1), dout.stride(2), dout.stride(3),
dt.stride(0), dt.stride(2), dt.stride(1), dt.stride(3),
dA_cumsum.stride(0), dA_cumsum.stride(2), dA_cumsum.stride(1), dA_cumsum.stride(3),
*((seq_idx.stride(0), seq_idx.stride(1)) if seq_idx is not None else (0, 0)),
D.stride(0) if D is not None else 0,
B.stride(0), B.stride(1), B.stride(2), B.stride(3),
dstates.stride(0), dstates.stride(1), dstates.stride(2), dstates.stride(3), dstates.stride(4),
dx.stride(0), dx.stride(1), dx.stride(2), dx.stride(3),
ddt.stride(0), ddt.stride(2), ddt.stride(1), ddt.stride(3),
dD_strides[1], dD_strides[2], dD_strides[3], dD_strides[0], dD_strides[4],
D is not None,
D.dim() == 2 if D is not None else True,
HAS_SEQ_IDX=seq_idx is not None,
BLOCK_SIZE_DSTATE=max(triton.next_power_of_2(dstate), 16),
IS_TRITON_22=TRITON_22
)
if D is not None:
BLOCK_SIZE_actual = _chunk_scan_chunk_state_bwd_dx_kernel.best_config.kwargs["BLOCK_SIZE_M"]
n_valid_blocks = (chunk_size + BLOCK_SIZE_actual - 1) // BLOCK_SIZE_actual
dD = dD[:n_valid_blocks].sum(dim=(0, 1, 2)).to(dtype=D.dtype)
if D.dim() == 1:
dD = rearrange(dD, "h 1 -> h")
return dx, ddt.to(dtype=dt.dtype), dD
def _mamba_chunk_scan_combined_fwd(x, dt, A, B, C, chunk_size, D=None, z=None, dt_bias=None, initial_states=None, seq_idx=None, cu_seqlens=None, dt_softplus=False, dt_limit=(0.0, float("inf"))):
batch, seqlen, nheads, headdim = x.shape
_, _, ngroups, dstate = B.shape
assert nheads % ngroups == 0
assert B.shape == (batch, seqlen, ngroups, dstate)
assert x.shape == (batch, seqlen, nheads, headdim)
assert dt.shape == (batch, seqlen, nheads)
assert A.shape == (nheads,)
assert C.shape == B.shape
if z is not None:
assert z.shape == x.shape
if D is not None:
assert D.shape == (nheads, headdim) or D.shape == (nheads,)
if seq_idx is not None:
assert seq_idx.shape == (batch, seqlen)
if B.stride(-1) != 1:
B = B.contiguous()
if C.stride(-1) != 1:
C = C.contiguous()
if x.stride(-1) != 1 and x.stride(1) != 1: # Either M or K dimension should be contiguous
x = x.contiguous()
if z is not None and z.stride(-1) != 1 and z.stride(1) != 1: # Either M or K dimension should be contiguous
z = z.contiguous()
if D is not None and D.stride(-1) != 1:
D = D.contiguous()
if initial_states is not None:
assert initial_states.shape == (batch, nheads, headdim, dstate)
# # (batch, nchunks, chunk_size, chunk_size) or (batch, nchunks, nheads, chunk_size, chunk_size)
# dA_cumsum_tmp0, dt_tmp0 = _chunk_cumsum_fwd(dt[:, :147], A, chunk_size, dt_bias=dt_bias, dt_softplus=dt_softplus)
# dA_cumsum_tmp1, dt_tmp1 = _chunk_cumsum_fwd(dt[:, 147:], A, chunk_size, dt_bias=dt_bias, dt_softplus=dt_softplus)
# dA_cumsum_tmp2, dt_tmp2 = _chunk_cumsum_fwd(dt[:, 147:256], A, chunk_size, dt_bias=dt_bias, dt_softplus=dt_softplus)
dA_cumsum, dt = _chunk_cumsum_fwd(dt, A, chunk_size, dt_bias=dt_bias, dt_softplus=dt_softplus, dt_limit=dt_limit)
states = _chunk_state_fwd(B, x, dt, dA_cumsum, seq_idx=seq_idx, states_in_fp32=True)
# states_tmp0 = _chunk_state_fwd(B[:, :147], x[:, :147], dt_tmp0, dA_cumsum_tmp0, states_in_fp32=True)
# states_tmp1 = _chunk_state_fwd(B[:, 147:], x[:, 147:], dt_tmp1, dA_cumsum_tmp1, states_in_fp32=True)
# states_tmp2 = _chunk_state_fwd(B[:, 147:256], x[:, 147:256], dt_tmp2, dA_cumsum_tmp2, states_in_fp32=True)
states, final_states = _state_passing_fwd(rearrange(states, "... p n -> ... (p n)"), dA_cumsum[:, :, :, -1],
initial_states=rearrange(initial_states, "... p n -> ... (p n)") if initial_states is not None else None,
seq_idx=seq_idx, chunk_size=chunk_size, out_dtype=C.dtype)
states, final_states = [rearrange(t, "... (p n) -> ... p n", n=dstate) for t in [states, final_states]]
# states_tmp0 = rearrange(_state_passing_fwd(rearrange(states_tmp0, "... p n -> ... (p n)"), dA_cumsum_tmp0[:, :, :, -1], chunk_size=chunk_size), "... (p n) -> ... p n", n=dstate)
# states_tmp1 = rearrange(_state_passing_fwd(rearrange(states_tmp1, "... p n -> ... (p n)"), dA_cumsum_tmp1[:, :, :, -1], chunk_size=chunk_size), "... (p n) -> ... p n", n=dstate)
CB = _bmm_chunk_fwd(C, B, chunk_size, seq_idx=seq_idx, output_dtype=torch.float32)
out, out_x = _chunk_scan_fwd(CB, x, dt, dA_cumsum, C, states, D=D, z=z, seq_idx=seq_idx)
if cu_seqlens is None:
return out, out_x, dt, dA_cumsum, states, final_states
else:
assert batch == 1, "passing cu_seqlens to get the varlen states is only supported if batch dimension is 1"
varlen_states = chunk_state_varlen(B.squeeze(0), x.squeeze(0), dt.squeeze(0), dA_cumsum.squeeze(0),
cu_seqlens, states.squeeze(0))
return out, out_x, dt, dA_cumsum, states, final_states, varlen_states
def _mamba_chunk_scan_combined_bwd(dout, x, dt, A, B, C, out, chunk_size, D=None, z=None,
dt_bias=None, initial_states=None, dfinal_states=None, seq_idx=None, dt_softplus=False,
dt_limit=(0.0, float("inf")),
dx=None, ddt=None, dB=None, dC=None, dz=None, recompute_output=False):
if dout.stride(-1) != 1:
dout = dout.contiguous()
batch, seqlen, nheads, headdim = x.shape
nchunks = math.ceil(seqlen / chunk_size)
_, _, ngroups, dstate = B.shape
assert dout.shape == (batch, seqlen, nheads, headdim)
assert dt.shape == (batch, seqlen, nheads)
assert A.shape == (nheads,)
assert nheads % ngroups == 0
assert B.shape == (batch, seqlen, ngroups, dstate)
assert C.shape == B.shape
assert out.shape == x.shape
if initial_states is not None:
assert initial_states.shape == (batch, nheads, headdim, dstate)
if seq_idx is not None:
assert seq_idx.shape == (batch, seqlen)
if dx is not None:
assert dx.shape == x.shape
if dB is not None:
assert dB.shape == B.shape
dB_given = dB
else:
dB_given = torch.empty_like(B)
if dC is not None:
assert dC.shape == C.shape
dC_given = dC
else:
dC_given = torch.empty_like(C)
if dz is not None:
assert z is not None
assert dz.shape == z.shape
if ddt is not None:
assert ddt.shape == dt.shape
ddt_given = ddt
else:
ddt_given = torch.empty_like(dt)
# TD: For some reason Triton (2.1.0 and 2.2.0) errors with
# "[CUDA]: invalid device context" (e.g. during varlne test), and cloning makes it work. Idk why.
dt_in = dt.clone()
dA_cumsum, dt = _chunk_cumsum_fwd(dt_in, A, chunk_size, dt_bias=dt_bias, dt_softplus=dt_softplus,
dt_limit=dt_limit)
CB = _bmm_chunk_fwd(C, B, chunk_size, seq_idx=seq_idx, output_dtype=torch.float32)
states = _chunk_state_fwd(B, x, dt, dA_cumsum, seq_idx=seq_idx, states_in_fp32=True)
states, _ = _state_passing_fwd(rearrange(states, "... p n -> ... (p n)"), dA_cumsum[:, :, :, -1],
initial_states=rearrange(initial_states, "... p n -> ... (p n)") if initial_states is not None else None,
seq_idx=seq_idx, chunk_size=chunk_size)
states = rearrange(states, "... (p n) -> ... p n", n=dstate)
if z is not None:
dz, dout, dD, *rest = _chunk_scan_bwd_dz(x, z, out, dout, chunk_size=chunk_size, has_ddAcs=False, D=D, dz=dz, recompute_output=recompute_output)
outz = rest[0] if recompute_output else out
else:
dz = None
outz = out
dstates = _chunk_scan_bwd_dstates(C, dA_cumsum, dout, seq_idx=seq_idx, dtype=states.dtype)
# dstates has length nchunks, containing the gradient to initial states at index 0 and
# gradient to the states of chunk (nchunks - 2) at index (nchunks - 1)
# Do computation in fp32 but convert dstates and states to fp16/bf16 since dstates and states
# will be used in matmul in the next kernels.
dstates, ddA_chunk_cumsum, dinitial_states, states = _state_passing_bwd(
rearrange(states, "... p n -> ... (p n)"),
dA_cumsum[:, :, :, -1],
rearrange(dstates, "... p n -> ... (p n)"),
dfinal_states=rearrange(dfinal_states, "... p n -> ... (p n)") if dfinal_states is not None else None,
seq_idx=seq_idx,
has_initial_states=initial_states is not None,
dstates_dtype=x.dtype,
states_dtype=x.dtype,
chunk_size=chunk_size,
)
# dstates has length nchunks, containing the gradient to states of chunk 0 at index 0 and
# gradient to the final states at index (nchunks - 1)
# states has length nchunks, containing the initial states at index 0 and the state for chunk (nchunks - 2) at index (nchunks - 1)
# The final states is not stored.
states = rearrange(states, "... (p n) -> ... p n", n=dstate)
dstates = rearrange(dstates, "... (p n) -> ... p n", n=dstate)
dinitial_states = rearrange(dinitial_states, "... (p n) -> ... p n", n=dstate) if dinitial_states is not None else None
dx, ddt, dD_from_x = _chunk_scan_chunk_state_bwd_dx(x, dt, dA_cumsum, B, CB, dout, dstates, D=D, seq_idx=seq_idx, dx=dx)
# dB = _chunk_state_bwd_db(x, dt, dA_cumsum, dstates, seq_idx=seq_idx, ngroups=ngroups)
dB, ddA_next = _chunk_state_bwd_db(x, dt, dA_cumsum, dstates, seq_idx=seq_idx, B=B, ngroups=ngroups)
# dC = _chunk_scan_bwd_dC(states[:, :-1].to(x.dtype), dA_cumsum, dout, seq_idx=seq_idx, ngroups=ngroups)
dC, ddA_cumsum_prev = _chunk_scan_bwd_dC(states.to(x.dtype), dA_cumsum, dout, seq_idx=seq_idx, C=C, ngroups=ngroups)
# Computing ddA with the dcb kernel is much slower, so we're not using it for now
dCB = _chunk_scan_bwd_dcb(x, dt, dA_cumsum, dout, seq_idx=seq_idx, ngroups=ngroups)
# dCB, ddA_tmp = _chunk_scan_bwd_dcb(x, dt, dA_cumsum, dout, seq_idx=seq_idx, CB=CB, ngroups=ngroups)
dCB = dCB.to(CB.dtype)
_bmm_chunk_bwd(C, dCB, residual=dB, out=dB_given)
_bmm_chunk_bwd(B, rearrange(dCB, "... l s -> ... s l"), residual=dC, out=dC_given)
    # If we have z, then dout_x is recomputed in fp32 inside the dz kernel, so the dD computed there
    # is more accurate than dD_from_x, which uses dout_x in fp16/bf16.
if z is None:
dD = dD_from_x
# Formula for ddA_cumsum, assuming out is the output of the forward pass before adding x * D.
# ddA_cumsum = torch.einsum("bclhp,bclhp->bhcl", out.float(), dout.float()) - ddt * dt
# However, this is numerically unstable: when we do the reverse cumsum on ddA_cumsum, there might
# be a lot of underflow.
# This is already done as part of bwd_dC kernel
# ddA_cumsum_prev = _chunk_scan_bwd_ddAcs_prev(states[:, :-1], C, dout, dA_cumsum, seq_idx=seq_idx)
ddA_cumsum_prev[..., -1] += ddA_chunk_cumsum
ddA_prev = ddA_cumsum_prev.flip([-1]).cumsum(dim=-1).flip([-1])
# This is already done as part of bwd_dB kernel
# ddA_next = _chunk_state_bwd_ddAcs_stable(B, x, dt, dA_cumsum, dstates, seq_idx=seq_idx)
# We don't need to pass in seq_idx because CB also zeros out entries where seq_idx[i] != seq_idx[j]
ddA = _chunk_scan_bwd_ddAcs_stable(x, dt, dA_cumsum, dout, CB)
ddA += ddA_next + ddA_prev
ddt_given, dA, ddt_bias = _chunk_cumsum_bwd(ddA, ddt, dt_in, A, dt_bias=dt_bias, dt_softplus=dt_softplus, dt_limit=dt_limit, ddt=ddt_given)
# These 2 lines are just to test ddt and dA being computed by old code
# _, dA = selective_scan_bwd(dout, x, dt, A, B, C, D=D.float(), z=z)
# ddt_given.copy_(ddt)
return_vals = (dx, ddt_given, dA, dB_given, dC_given, dD, dz, ddt_bias, dinitial_states)
return return_vals if not recompute_output else (*return_vals, outz)
def selective_scan_bwd(dout, x, dt, A, B, C, D=None, z=None):
"""
Argument:
dout: (batch, seqlen, nheads, headdim)
x: (batch, seqlen, nheads, headdim)
dt: (batch, nheads, nchunks, chunk_size) or (batch, nheads, headdim, nchunks, chunk_size)
A: (nheads) or (dim, dstate)
B: (batch, seqlen, ngroups, dstate)
C: (batch, seqlen, ngroups, dstate)
D: (nheads, headdim) or (nheads,)
z: (batch, seqlen, nheads, headdim)
Return:
        ddt: (batch, nheads, nchunks, chunk_size) if dt has 4 dims, else (batch, nheads, headdim, nchunks, chunk_size)
        dA: (nheads,) if A is 1D, else (dim, dstate)
"""
import selective_scan
batch, seqlen, nheads, headdim = x.shape
chunk_size = dt.shape[-1]
_, _, ngroups, dstate = B.shape
assert nheads % ngroups == 0
x = rearrange(x, "b l h p -> b (h p) l")
squeeze_dt = dt.dim() == 4
if dt.dim() == 4:
dt = repeat(dt, "b h c l -> b h p c l", p=headdim)
dt = rearrange(dt, "b h p c l -> b (h p) (c l)", p=headdim)
squeeze_A = A.dim() == 1
if A.dim() == 1:
A = repeat(A, "h -> (h p) n", p=headdim, n=dstate).to(dtype=torch.float32)
else:
A = A.to(dtype=torch.float32)
B = rearrange(B, "b l g n -> b g n l")
C = rearrange(C, "b l g n -> b g n l")
if D is not None:
if D.dim() == 2:
D = rearrange(D, "h p -> (h p)")
else:
D = repeat(D, "h -> (h p)", p=headdim)
if z is not None:
z = rearrange(z, "b l h p -> b (h p) l")
if x.stride(-1) != 1:
x = x.contiguous()
if dt.stride(-1) != 1:
dt = dt.contiguous()
if D is not None:
D = D.contiguous()
if B.stride(-1) != 1:
B = B.contiguous()
if C.stride(-1) != 1:
C = C.contiguous()
if z is not None and z.stride(-1) != 1:
z = z.contiguous()
_, intermediate, *rest = selective_scan.fwd(x, dt.to(dtype=x.dtype), A, B, C, D, z, None, False)
if z is not None:
out = rest[0]
else:
out = None
dout = rearrange(dout, "b l h p -> b (h p) l")
if dout.stride(-1) != 1:
dout = dout.contiguous()
# The kernel supports passing in a pre-allocated dz (e.g., in case we want to fuse the
# backward of selective_scan with the backward of chunk).
# Here we just pass in None and dz will be allocated in the C++ code.
_, ddt, dA, *rest = selective_scan.bwd(
x, dt.to(dtype=x.dtype), A, B, C, D, z, None, dout, intermediate, out, None, False,
False # option to recompute out_z, not used here
)
ddt = rearrange(ddt, "b (h p) (c l) -> b h p c l", p=headdim, l=chunk_size)
if squeeze_dt:
ddt = ddt.float().sum(dim=2)
if squeeze_A:
dA = rearrange(dA, "(h p) n -> h p n", p=headdim).sum(dim=(1, 2))
return ddt, dA
class MambaChunkScanCombinedFn(torch.autograd.Function):
@staticmethod
def forward(ctx, x, dt, A, B, C, chunk_size, D=None, z=None, dt_bias=None, initial_states=None, seq_idx=None, cu_seqlens=None, dt_softplus=False, dt_limit=(0.0, float("inf")), return_final_states=False, return_varlen_states=False):
ctx.dt_dtype = dt.dtype
if not return_varlen_states:
cu_seqlens = None
else:
assert cu_seqlens is not None, "cu_seqlens must be provided if return_varlen_states is True"
out, out_x, dt_out, dA_cumsum, states, final_states, *rest = _mamba_chunk_scan_combined_fwd(x, dt, A, B, C, chunk_size, D=D, z=z, dt_bias=dt_bias, initial_states=initial_states, seq_idx=seq_idx, cu_seqlens=cu_seqlens, dt_softplus=dt_softplus, dt_limit=dt_limit)
ctx.save_for_backward(out if z is None else out_x, x, dt, dA_cumsum, A, B, C, D, z, dt_bias, initial_states, seq_idx)
ctx.dt_softplus = dt_softplus
ctx.chunk_size = chunk_size
ctx.dt_limit = dt_limit
ctx.return_final_states = return_final_states
ctx.return_varlen_states = return_varlen_states
if not return_varlen_states:
return out if not return_final_states else (out, final_states)
else:
varlen_states = rest[0]
return (out, varlen_states) if not return_final_states else (out, final_states, varlen_states)
@staticmethod
def backward(ctx, dout, *args):
out, x, dt, dA_cumsum, A, B, C, D, z, dt_bias, initial_states, seq_idx = ctx.saved_tensors
assert not ctx.return_varlen_states, "return_varlen_states is not supported in backward"
dfinal_states = args[0] if ctx.return_final_states else None
dx, ddt, dA, dB, dC, dD, dz, ddt_bias, dinitial_states = _mamba_chunk_scan_combined_bwd(dout, x, dt, A, B, C, out, ctx.chunk_size, D=D, z=z, dt_bias=dt_bias, initial_states=initial_states, dfinal_states=dfinal_states, seq_idx=seq_idx, dt_softplus=ctx.dt_softplus, dt_limit=ctx.dt_limit)
return dx, ddt, dA, dB, dC, None, dD, dz, ddt_bias, dinitial_states, None, None, None, None, None, None
def mamba_chunk_scan_combined(x, dt, A, B, C, chunk_size, D=None, z=None, dt_bias=None, initial_states=None, seq_idx=None, cu_seqlens=None, dt_softplus=False, dt_limit=(0.0, float("inf")), return_final_states=False, return_varlen_states=False):
"""
Argument:
x: (batch, seqlen, nheads, headdim)
dt: (batch, seqlen, nheads)
A: (nheads)
B: (batch, seqlen, ngroups, dstate)
C: (batch, seqlen, ngroups, dstate)
chunk_size: int
D: (nheads, headdim) or (nheads,)
z: (batch, seqlen, nheads, headdim)
dt_bias: (nheads,)
initial_states: (batch, nheads, headdim, dstate)
seq_idx: (batch, seqlen)
cu_seqlens: (num_sequences + 1) or None, only used if return_varlen_states is True
dt_softplus: Whether to apply softplus to dt
Return:
out: (batch, seqlen, nheads, headdim)
"""
return MambaChunkScanCombinedFn.apply(x, dt, A, B, C, chunk_size, D, z, dt_bias, initial_states, seq_idx, cu_seqlens, dt_softplus, dt_limit, return_final_states, return_varlen_states)
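# Illustrative usage sketch for mamba_chunk_scan_combined (commented out, not executed here;
# the shapes and dtypes below are assumptions chosen only to match the docstring, not values
# from any particular model config):
#
#     batch, seqlen, nheads, headdim, ngroups, dstate, chunk_size = 2, 512, 8, 64, 1, 16, 256
#     x = torch.randn(batch, seqlen, nheads, headdim, device="cuda", dtype=torch.bfloat16)
#     dt = torch.rand(batch, seqlen, nheads, device="cuda", dtype=torch.bfloat16)
#     A = -torch.rand(nheads, device="cuda")
#     B = torch.randn(batch, seqlen, ngroups, dstate, device="cuda", dtype=torch.bfloat16)
#     C = torch.randn(batch, seqlen, ngroups, dstate, device="cuda", dtype=torch.bfloat16)
#     D = torch.randn(nheads, device="cuda")
#     out = mamba_chunk_scan_combined(x, dt, A, B, C, chunk_size, D=D, dt_softplus=True)
#     # out: (batch, seqlen, nheads, headdim)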
def mamba_chunk_scan(x, dt, A, B, C, chunk_size, D=None, z=None, dt_bias=None, dt_softplus=False):
"""
Argument:
x: (batch, seqlen, nheads, headdim)
dt: (batch, seqlen, nheads)
A: (nheads)
B: (batch, seqlen, ngroups, dstate)
C: (batch, seqlen, ngroups, dstate)
D: (nheads, headdim) or (nheads,)
z: (batch, seqlen, nheads, headdim)
dt_bias: (nheads,)
Return:
out: (batch, seqlen, nheads, headdim)
"""
batch, seqlen, nheads, headdim = x.shape
dstate = B.shape[-1]
if seqlen % chunk_size != 0:
dt = F.pad(dt, (0, 0, 0, chunk_size - seqlen % chunk_size))
dt = rearrange(dt, "b (c l) h -> b h c l", l=chunk_size)
dt = dt.float() # We want high precision for this before cumsum
if dt_bias is not None:
dt = dt + rearrange(dt_bias, "h -> h 1 1")
if dt_softplus:
dt = F.softplus(dt)
    dA = dt * rearrange(A, "h -> h 1 1")
dA_cumsum = torch.cumsum(dA, dim=-1)
# 1. Compute the state for each chunk
states = chunk_state(B, x, dt, dA_cumsum, states_in_fp32=True)
# 2. Pass the state to all the chunks by weighted cumsum.
states = rearrange(state_passing(rearrange(states, "... p n -> ... (p n)"), dA_cumsum[:, :, :, -1])[0],
"... (p n) -> ... p n", n=dstate)
# 3. Compute the output for each chunk
out = chunk_scan(B, C, x, dt, dA_cumsum, states, D=D, z=z)
return out
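# A minimal sketch of the per-timestep recurrence that the three chunked steps above
# reorganize (an illustration consistent with the reference functions, not code from this
# repo), written for a single head and ignoring D and z:
#
#     h = torch.zeros(headdim, dstate)                      # running SSM state
#     for t in range(seqlen):
#         dA_t = dt[t] * A                                   # scalar decay exponent for this head
#         h = torch.exp(dA_t) * h + dt[t] * torch.outer(x[t], B[t])
#         y[t] = h @ C[t]                                    # (headdim,)
#
# Chunking turns the intra-chunk work into batched matmuls (chunk_state / chunk_scan) and
# the cross-chunk part into the state_passing recurrence over per-chunk states.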
def ssd_chunk_scan_combined_ref(x, dt, A, B, C, chunk_size, D=None, z=None, dt_bias=None, dt_softplus=False):
"""
Argument:
x: (batch, seqlen, nheads, headdim)
dt: (batch, seqlen, nheads)
A: (nheads)
B: (batch, seqlen, ngroups, dstate)
C: (batch, seqlen, ngroups, dstate)
D: (nheads, headdim) or (nheads,)
z: (batch, seqlen, nheads, headdim)
dt_bias: (nheads,)
Return:
out: (batch, seqlen, nheads, headdim)
"""
batch, seqlen, nheads, headdim = x.shape
dstate = B.shape[-1]
if seqlen % chunk_size != 0:
dt = F.pad(dt, (0, 0, 0, chunk_size - seqlen % chunk_size))
dt = rearrange(dt, "b (c l) h -> b h c l", l=chunk_size)
dt = dt.float() # We want high precision for this before cumsum
if dt_bias is not None:
dt = dt + rearrange(dt_bias, "h -> h 1 1")
if dt_softplus:
dt = F.softplus(dt)
dA = dt * rearrange(A, "h -> h 1 1")
dA_cumsum = torch.cumsum(dA, dim=-1)
# 1. Compute the state for each chunk
states = chunk_state_ref(B, x, dt, dA_cumsum)
states_dtype = states.dtype
if states.dtype not in [torch.float32, torch.float64]:
states = states.to(torch.float32)
# 2. Pass the state to all the chunks by weighted cumsum.
# state_passing_ref is much less numerically stable
states = rearrange(state_passing_ref(rearrange(states, "... p n -> ... (p n)"), dA_cumsum[:, :, :, -1])[0],
"... (p n) -> ... p n", n=dstate)
states = states.to(states_dtype)
# 3. Compute the output for each chunk
out = chunk_scan_ref(B, C, x, dt, dA_cumsum, states, D=D, z=z)
return out
def ssd_selective_scan(x, dt, A, B, C, D=None, z=None, dt_bias=None, dt_softplus=False, dt_limit=(0.0, float("inf"))):
"""
Argument:
x: (batch, seqlen, nheads, headdim)
dt: (batch, seqlen, nheads) or (batch, seqlen, nheads, headdim)
A: (nheads) or (dim, dstate)
B: (batch, seqlen, ngroups, dstate)
C: (batch, seqlen, ngroups, dstate)
D: (nheads, headdim) or (nheads,)
z: (batch, seqlen, nheads, headdim)
dt_bias: (nheads,) or (nheads, headdim)
Return:
out: (batch, seqlen, nheads, headdim)
"""
from mamba_ssm.ops.selective_scan_interface import selective_scan_fn
batch, seqlen, nheads, headdim = x.shape
_, _, ngroups, dstate = B.shape
x = rearrange(x, "b l h p -> b (h p) l")
if dt.dim() == 3:
dt = repeat(dt, "b l h -> b l h p", p=headdim)
dt = rearrange(dt, "b l h p -> b (h p) l")
if A.dim() == 1:
A = repeat(A, "h -> (h p) n", p=headdim, n=dstate).to(dtype=torch.float32)
else:
A = A.to(dtype=torch.float32)
B = rearrange(B, "b l g n -> b g n l")
C = rearrange(C, "b l g n -> b g n l")
if D is not None:
if D.dim() == 2:
D = rearrange(D, "h p -> (h p)")
else:
D = repeat(D, "h -> (h p)", p=headdim)
if z is not None:
z = rearrange(z, "b l h p -> b (h p) l")
if dt_bias is not None:
if dt_bias.dim() == 1:
dt_bias = repeat(dt_bias, "h -> h p", p=headdim)
dt_bias = rearrange(dt_bias, "h p -> (h p)")
if dt_limit != (0.0, float("inf")):
if dt_bias is not None:
dt = dt + rearrange(dt_bias, "d -> d 1")
if dt_softplus:
dt = F.softplus(dt)
dt = dt.clamp(min=dt_limit[0], max=dt_limit[1]).to(x.dtype)
dt_bias = None
dt_softplus = None
out = selective_scan_fn(x, dt, A, B, C, D=D, z=z, delta_bias=dt_bias, delta_softplus=dt_softplus)
return rearrange(out, "b (h p) l -> b l h p", p=headdim)
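# Illustrative note (a sketch, not part of the original code): ssd_selective_scan adapts the
# multi-head (batch, seqlen, nheads, headdim) layout to the channel-major layout that
# selective_scan_fn expects. For example, assuming nheads=8 and headdim=64, the shapes map as:
#
#     x:   (b, l, 8, 64)             -> (b, 512, l)      # dim = nheads * headdim = 512
#     dt:  (b, l, 8)                 -> (b, 512, l)      # broadcast across headdim first
#     A:   (8,)                      -> (512, dstate)    # repeated per channel, cast to fp32
#     B,C: (b, l, ngroups, dstate)   -> (b, ngroups, dstate, l)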
def mamba_conv1d_scan_ref(xBC, conv1d_weight, conv1d_bias, dt, A, chunk_size, D=None, z=None,
dt_bias=None, dt_softplus=False, dt_limit=(0.0, float("inf")),
activation="silu", headdim=None, ngroups=1):
"""
Argument:
xBC: (batch, seqlen, dim + 2 * ngroups * dstate) where dim == nheads * headdim
conv1d_weight: (dim + 2 * ngroups * dstate, width)
conv1d_bias: (dim + 2 * ngroups * dstate,)
dt: (batch, seqlen, nheads) or (batch, seqlen, nheads, headdim)
A: (nheads)
D: (nheads, headdim) or (nheads,)
z: (batch, seqlen, dim)
dt_bias: (nheads) or (nheads, headdim)
headdim: if D is 1D and z is None, headdim must be passed in
Return:
out: (batch, seqlen, dim)
"""
batch, seqlen, nheads = dt.shape[:3]
assert nheads % ngroups == 0
if z is not None:
dim = z.shape[-1]
assert dim % nheads == 0
headdim = dim // nheads
else:
if D.dim() == 1:
assert headdim is not None
else:
headdim = D.shape[1]
dim = nheads * headdim
xBC = rearrange(causal_conv1d_fn(rearrange(xBC, "b s d -> b d s"), conv1d_weight, conv1d_bias, activation=activation),
"b d s -> b s d")
dstate = (xBC.shape[-1] - dim) // ngroups // 2
x, B, C = torch.split(xBC, [dim, ngroups * dstate, ngroups * dstate], dim=-1)
x = rearrange(x, "b l (h p) -> b l h p", h=nheads)
B = rearrange(B, "b l (g n) -> b l g n", g=ngroups)
C = rearrange(C, "b l (g n) -> b l g n", g=ngroups)
z = rearrange(z, "b l (h p) -> b l h p", h=nheads) if z is not None else None
out = ssd_selective_scan(x, dt.to(x.dtype), A, B, C, D=D.float(), z=z, dt_bias=dt_bias, dt_softplus=dt_softplus, dt_limit=dt_limit)
return rearrange(out, "b s h p -> b s (h p)")
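# Worked example of the packed xBC layout handled above (the numbers are assumptions picked
# only for illustration):
#
#     nheads, headdim, ngroups, dstate = 8, 64, 1, 16
#     dim = nheads * headdim                              # 512
#     xBC.shape[-1] == dim + 2 * ngroups * dstate         # 512 + 32 = 544 channels
#     x, B, C = torch.split(xBC, [512, 16, 16], dim=-1)
#     # x -> (batch, seqlen, 8, 64); B and C -> (batch, seqlen, 1, 16) before ssd_selective_scan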
class MambaSplitConv1dScanCombinedFn(torch.autograd.Function):
@staticmethod
@custom_fwd
def forward(ctx, zxbcdt, conv1d_weight, conv1d_bias, dt_bias, A, D, chunk_size, initial_states=None, seq_idx=None, dt_limit=(0.0, float("inf")), return_final_states=False, activation="silu",
rmsnorm_weight=None, rmsnorm_eps=1e-6, outproj_weight=None, outproj_bias=None, headdim=None,
ngroups=1, norm_before_gate=True):
assert activation in [None, "silu", "swish"]
if D.dim() == 1:
assert headdim is not None
nheads, = D.shape
else:
nheads, headdim = D.shape
batch, seqlen, _ = zxbcdt.shape
dim = nheads * headdim
assert nheads % ngroups == 0
dstate = (conv1d_weight.shape[0] - dim) // ngroups // 2
d_nonssm = (zxbcdt.shape[-1] - 2 * dim - 2 * ngroups * dstate - nheads) // 2
assert d_nonssm >= 0
assert zxbcdt.shape == (batch, seqlen, 2 * d_nonssm + 2 * dim + 2 * ngroups * dstate + nheads)
assert dt_bias.shape == (nheads,)
assert A.shape == (nheads,)
zx0, z, xBC, dt = torch.split(zxbcdt, [2 * d_nonssm, dim, dim + ngroups * dstate * 2, nheads], dim=-1)
seq_idx = seq_idx.contiguous() if seq_idx is not None else None
xBC_conv = rearrange(
causal_conv1d_cuda.causal_conv1d_fwd(rearrange(xBC, "b s d -> b d s"),
conv1d_weight, conv1d_bias, seq_idx, None, None, activation in ["silu", "swish"]),
"b d s -> b s d"
)
x, B, C = torch.split(xBC_conv, [dim, ngroups * dstate, ngroups * dstate], dim=-1)
x = rearrange(x, "b l (h p) -> b l h p", h=nheads)
B = rearrange(B, "b l (g n) -> b l g n", g=ngroups)
C = rearrange(C, "b l (g n) -> b l g n", g=ngroups)
z = rearrange(z, "b l (h p) -> b l h p", h=nheads) if z is not None else None
if rmsnorm_weight is None:
out, out_x, dt_out, dA_cumsum, states, final_states = _mamba_chunk_scan_combined_fwd(x, dt, A, B, C, chunk_size=chunk_size, D=D, z=z, dt_bias=dt_bias, initial_states=initial_states, seq_idx=seq_idx, dt_softplus=True, dt_limit=dt_limit)
out = rearrange(out, "b s h p -> b s (h p)")
rstd = None
if d_nonssm > 0:
out = torch.cat([_swiglu_fwd(zx0), out], dim=-1)
else:
out_x, _, dt_out, dA_cumsum, states, final_states = _mamba_chunk_scan_combined_fwd(x, dt, A, B, C, chunk_size=chunk_size, D=D, z=None, dt_bias=dt_bias, initial_states=initial_states, seq_idx=seq_idx, dt_softplus=True, dt_limit=dt_limit)
# reshape input data into 2D tensor
x_rms = rearrange(out_x, "b s h p -> (b s) (h p)")
z_rms = rearrange(z, "b s h p -> (b s) (h p)")
rmsnorm_weight = rmsnorm_weight.contiguous()
if d_nonssm == 0:
out = None
else:
out01 = torch.empty((batch, seqlen, d_nonssm + dim), dtype=x_rms.dtype, device=x_rms.device)
out = rearrange(out01[..., d_nonssm:], "b s d -> (b s) d")
_swiglu_fwd(zx0, out=out01[..., :d_nonssm])
out, _, rstd = _layer_norm_fwd(x_rms, rmsnorm_weight, None, rmsnorm_eps, z_rms, out=out,
group_size=dim // ngroups,
norm_before_gate=norm_before_gate, is_rms_norm=True)
if d_nonssm == 0:
out = rearrange(out, "(b s) d -> b s d", b=batch)
else:
out = out01
ctx.outproj_weight_dtype = outproj_weight.dtype if outproj_weight is not None else None
if outproj_weight is not None:
if torch.is_autocast_enabled():
dtype = torch.get_autocast_gpu_dtype()
out, outproj_weight = out.to(dtype), outproj_weight.to(dtype)
outproj_bias = outproj_bias.to(dtype) if outproj_bias is not None else None
out = F.linear(out, outproj_weight, outproj_bias)
else:
assert outproj_bias is None
ctx.save_for_backward(zxbcdt, conv1d_weight, conv1d_bias,
out_x, A, D, dt_bias, initial_states, seq_idx, rmsnorm_weight, rstd, outproj_weight, outproj_bias)
ctx.dt_limit = dt_limit
ctx.return_final_states = return_final_states
ctx.activation = activation
ctx.rmsnorm_eps = rmsnorm_eps
ctx.norm_before_gate = norm_before_gate
ctx.chunk_size = chunk_size
ctx.headdim = headdim
ctx.ngroups = ngroups
return out if not return_final_states else (out, final_states)
@staticmethod
@custom_bwd
def backward(ctx, dout, *args):
zxbcdt, conv1d_weight, conv1d_bias, out, A, D, dt_bias, initial_states, seq_idx, rmsnorm_weight, rstd, outproj_weight, outproj_bias = ctx.saved_tensors
dfinal_states = args[0] if ctx.return_final_states else None
headdim = ctx.headdim
nheads = D.shape[0]
dim = nheads * headdim
assert nheads % ctx.ngroups == 0
dstate = (conv1d_weight.shape[0] - dim) // ctx.ngroups // 2
d_nonssm = (zxbcdt.shape[-1] - 2 * dim - 2 * ctx.ngroups * dstate - nheads) // 2
assert d_nonssm >= 0
recompute_output = outproj_weight is not None
if recompute_output:
out_recompute = torch.empty(*out.shape[:2], d_nonssm + dim, device=out.device, dtype=out.dtype)
out0_recompute, out1_recompute = out_recompute.split([d_nonssm, dim], dim=-1)
zx0, z, xBC, dt = torch.split(zxbcdt, [2 * d_nonssm, dim, dim + 2 * ctx.ngroups * dstate, nheads], dim=-1)
# Recompute x, B, C
xBC_conv = rearrange(
causal_conv1d_cuda.causal_conv1d_fwd(rearrange(xBC, "b s d -> b d s"),
conv1d_weight, conv1d_bias, seq_idx, None, None, ctx.activation in ["silu", "swish"]),
"b d s -> b s d"
)
x, B, C = torch.split(xBC_conv, [dim, ctx.ngroups * dstate, ctx.ngroups * dstate], dim=-1)
x = rearrange(x, "b l (h p) -> b l h p", h=nheads)
B = rearrange(B, "b l (g n) -> b l g n", g=ctx.ngroups)
C = rearrange(C, "b l (g n) -> b l g n", g=ctx.ngroups)
dzxbcdt = torch.empty_like(zxbcdt)
dzx0, dz, dxBC_given, ddt_given = torch.split(dzxbcdt, [2 * d_nonssm, dim, dim + 2 * ctx.ngroups * dstate, nheads], dim=-1)
dxBC = torch.empty_like(xBC)
dx, dB, dC = torch.split(dxBC, [dim, ctx.ngroups * dstate, ctx.ngroups * dstate], dim=-1)
z = rearrange(z, "b l (h p) -> b l h p", h=nheads)
dx = rearrange(dx, "b l (h p) -> b l h p", h=nheads)
dB = rearrange(dB, "b l (g n) -> b l g n", g=ctx.ngroups)
dC = rearrange(dC, "b l (g n) -> b l g n", g=ctx.ngroups)
if outproj_weight is not None:
dout_og = dout
dout = F.linear(dout, outproj_weight.t())
if d_nonssm > 0:
dout0, dout = dout.split([d_nonssm, dim], dim=-1)
_swiglu_bwd(zx0, dout0, dxy=dzx0, recompute_output=True, out=out0_recompute)
dout = rearrange(dout, "b s (h p) -> b s h p", p=headdim)
if rmsnorm_weight is None:
dz = rearrange(dz, "b l (h p) -> b l h p", h=nheads)
dx, ddt, dA, dB, dC, dD, dz, ddt_bias, dinitial_states, *rest = _mamba_chunk_scan_combined_bwd(
dout, x, dt, A, B, C, out, ctx.chunk_size, D=D, z=z, dt_bias=dt_bias, initial_states=initial_states, dfinal_states=dfinal_states, seq_idx=seq_idx, dt_softplus=True, dt_limit=ctx.dt_limit, dx=dx, ddt=ddt_given, dB=dB, dC=dC, dz=dz, recompute_output=recompute_output
)
out_for_linear = rearrange(rest[0], "b s h p -> b s (h p)") if recompute_output else None
drmsnorm_weight = None
else:
batch = dout.shape[0]
dy_rms = rearrange(dout, "b s h p -> (b s) (h p)")
dz = rearrange(dz, "b l d -> (b l) d")
x_rms = rearrange(out, "b s h p -> (b s) (h p)")
z_rms = rearrange(z, "b s h p -> (b s) (h p)")
out1_recompute = rearrange(out1_recompute, "b s d -> (b s) d") if recompute_output else None
dout, drmsnorm_weight, _, dz, *rest = _layer_norm_bwd(dy_rms, x_rms, rmsnorm_weight, None, ctx.rmsnorm_eps, None, rstd, z_rms, norm_before_gate=ctx.norm_before_gate, is_rms_norm=True, recompute_output=recompute_output, dz=dz, out=out1_recompute if recompute_output else None)
out_for_linear = out_recompute if recompute_output else None
dout = rearrange(dout, "(b s) (h p) -> b s h p", b=batch, p=headdim)
dx, ddt, dA, dB, dC, dD, _, ddt_bias, dinitial_states = _mamba_chunk_scan_combined_bwd(
dout, x, dt, A, B, C, out, ctx.chunk_size, D=D, z=None, dt_bias=dt_bias, initial_states=initial_states, dfinal_states=dfinal_states, seq_idx=seq_idx, dt_softplus=True, dt_limit=ctx.dt_limit, dx=dx, ddt=ddt_given, dB=dB, dC=dC
)
if outproj_weight is not None:
doutproj_weight = torch.einsum("bso,bsd->od", dout_og, out_for_linear)
doutproj_bias = dout_og.sum(dim=(0, 1)) if outproj_bias is not None else None
else:
doutproj_weight, doutproj_bias = None, None
dxBC_given = rearrange(dxBC_given, "b s d -> b d s")
dxBC_given, dweight, dbias, *_ = causal_conv1d_cuda.causal_conv1d_bwd(
rearrange(xBC, "b s d -> b d s"), conv1d_weight, conv1d_bias,
rearrange(dxBC, "b s d -> b d s"), seq_idx, None, None, dxBC_given, False, ctx.activation in ["silu", "swish"]
)
dxBC_given = rearrange(dxBC_given, "b d s -> b s d")
return dzxbcdt, dweight, dbias, ddt_bias, dA, dD, None, dinitial_states, None, None, None, None, drmsnorm_weight, None, doutproj_weight, doutproj_bias, None, None, None
def mamba_split_conv1d_scan_combined(zxbcdt, conv1d_weight, conv1d_bias, dt_bias, A, D, chunk_size, initial_states=None, seq_idx=None, dt_limit=(0.0, float("inf")), return_final_states=False, activation="silu", rmsnorm_weight=None, rmsnorm_eps=1e-6, outproj_weight=None, outproj_bias=None, headdim=None, ngroups=1, norm_before_gate=True):
"""
Argument:
zxbcdt: (batch, seqlen, 2 * dim + 2 * ngroups * dstate + nheads) where dim == nheads * headdim
conv1d_weight: (dim + 2 * ngroups * dstate, width)
conv1d_bias: (dim + 2 * ngroups * dstate,)
dt_bias: (nheads,)
A: (nheads)
D: (nheads, headdim) or (nheads,)
initial_states: (batch, nheads, headdim, dstate)
seq_idx: (batch, seqlen), int32
rmsnorm_weight: (dim,)
outproj_weight: (out_dim, dim)
outproj_bias: (out_dim,)
headdim: if D is 1D, headdim must be passed in
norm_before_gate: if True, we do RMSNorm(x) * F.silu(z). If False, we do RMSNorm(x * F.silu(z))
Return:
out: (batch, seqlen, dim)
"""
return MambaSplitConv1dScanCombinedFn.apply(zxbcdt, conv1d_weight, conv1d_bias, dt_bias, A, D, chunk_size, initial_states, seq_idx, dt_limit, return_final_states, activation, rmsnorm_weight, rmsnorm_eps, outproj_weight, outproj_bias, headdim, ngroups, norm_before_gate)
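# Worked example of the zxbcdt packing this entry point expects (values are assumptions for
# illustration, not from any specific config): assuming nheads=8, headdim=64, ngroups=1,
# dstate=16 and no extra non-SSM channels,
#
#     dim = nheads * headdim                                        # 512
#     zxbcdt.shape[-1] == 2 * dim + 2 * ngroups * dstate + nheads   # 1024 + 32 + 8 = 1064
#     z, xBC, dt = torch.split(zxbcdt, [dim, dim + 2 * ngroups * dstate, nheads], dim=-1)
#
# i.e. the gate z, the conv input xBC (which itself packs x, B, C) and the per-head dt all
# live in one projection output, matching the reference split in mamba_split_conv1d_scan_ref below.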
def mamba_split_conv1d_scan_ref(zxbcdt, conv1d_weight, conv1d_bias, dt_bias, A, D, chunk_size, dt_limit=(0.0, float("inf")), activation="silu", rmsnorm_weight=None, rmsnorm_eps=1e-6, outproj_weight=None, outproj_bias=None, headdim=None, ngroups=1, norm_before_gate=True):
"""
Argument:
zxbcdt: (batch, seqlen, 2 * dim + 2 * ngroups * dstate + nheads) where dim == nheads * headdim
conv1d_weight: (dim + 2 * ngroups * dstate, width)
conv1d_bias: (dim + 2 * ngroups * dstate,)
dt_bias: (nheads,)
A: (nheads)
D: (nheads, headdim) or (nheads,)
rmsnorm_weight: (dim,)
outproj_weight: (out_dim, dim)
outproj_bias: (out_dim,)
headdim: if D is 1D, headdim must be passed in
norm_before_gate: if True, we do RMSNorm(x) * F.silu(z). If False, we do RMSNorm(x * F.silu(z))
Return:
out: (batch, seqlen, dim)
"""
if D.dim() == 1:
assert headdim is not None
nheads, = D.shape
else:
nheads, headdim = D.shape
assert nheads % ngroups == 0
batch, seqlen, _ = zxbcdt.shape
dim = nheads * headdim
dstate = (zxbcdt.shape[-1] - 2 * dim - nheads) // ngroups // 2
assert zxbcdt.shape == (batch, seqlen, 2 * dim + 2 * ngroups * dstate + nheads)
assert dt_bias.shape == (nheads,)
assert A.shape == (nheads,)
if rmsnorm_weight is not None:
assert rmsnorm_weight.shape == (dim,)
z, xBC, dt = torch.split(zxbcdt, [dim, dim + 2 * ngroups * dstate, nheads], dim=-1)
xBC = rearrange(causal_conv1d_fn(rearrange(xBC, "b s d -> b d s"), conv1d_weight, conv1d_bias, activation=activation),
"b d s -> b s d")
x, B, C = torch.split(xBC, [dim, ngroups * dstate, ngroups * dstate], dim=-1)
x = rearrange(x, "b l (h p) -> b l h p", h=nheads)
B = rearrange(B, "b l (g n) -> b l g n", g=ngroups)
C = rearrange(C, "b l (g n) -> b l g n", g=ngroups)
z = rearrange(z, "b l (h p) -> b l h p", h=nheads)
out = ssd_selective_scan(x, dt.to(x.dtype), A, B, C, D=D.float(),
z=z if rmsnorm_weight is None else None, dt_bias=dt_bias, dt_softplus=True, dt_limit=dt_limit)
out = rearrange(out, "b s h p -> b s (h p)")
if rmsnorm_weight is not None:
out = rmsnorm_fn(out, rmsnorm_weight, None, z=rearrange(z, "b l h p -> b l (h p)"), eps=rmsnorm_eps,
norm_before_gate=norm_before_gate)
if outproj_weight is not None:
out = F.linear(out, outproj_weight, outproj_bias)
return out
# Copyright (c) 2024, Tri Dao, Albert Gu.
"""We want triton==2.1.0 or 2.2.0 for this
"""
import math
import torch
import torch.nn.functional as F
import triton
import triton.language as tl
from einops import rearrange, repeat
@triton.autotune(
configs=[
triton.Config({'BLOCK_SIZE': 64}),
triton.Config({'BLOCK_SIZE': 128}),
triton.Config({'BLOCK_SIZE': 256}),
triton.Config({'BLOCK_SIZE': 512}),
triton.Config({'BLOCK_SIZE': 1024}),
triton.Config({'BLOCK_SIZE': 2048}),
],
key=['dim'],
)
@triton.jit
def _state_passing_fwd_kernel(
# Pointers to matrices
states_ptr, out_ptr, final_states_ptr, dA_cs_ptr, initstates_ptr, seq_idx_ptr,
# Matrix dimensions
dim, nchunks, seqlen, chunk_size,
# Strides
stride_states_batch, stride_states_chunk, stride_states_head, stride_states_dim,
stride_out_batch, stride_out_chunk, stride_out_head, stride_out_dim,
stride_final_states_batch, stride_final_states_head, stride_final_states_dim,
stride_dA_cs_batch, stride_dA_cs_chunk, stride_dA_cs_head,
stride_initstates_batch, stride_initstates_head, stride_initstates_dim,
stride_seq_idx_batch, stride_seq_idx_seqlen,
# Meta-parameters
HAS_INITSTATES: tl.constexpr,
HAS_SEQ_IDX: tl.constexpr,
BLOCK_SIZE: tl.constexpr,
):
pid_b = tl.program_id(axis=1)
pid_h = tl.program_id(axis=2)
pid_m = tl.program_id(axis=0)
states_ptr += pid_b * stride_states_batch + pid_h * stride_states_head
dA_cs_ptr += pid_b * stride_dA_cs_batch + pid_h * stride_dA_cs_head
out_ptr += pid_b * stride_out_batch + pid_h * stride_out_head
final_states_ptr += pid_b * stride_final_states_batch + pid_h * stride_final_states_head
if HAS_INITSTATES:
initstates_ptr += pid_b * stride_initstates_batch + pid_h * stride_initstates_head
if HAS_SEQ_IDX:
seq_idx_ptr += pid_b * stride_seq_idx_batch
offs_m = pid_m * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
states_ptrs = states_ptr + offs_m * stride_states_dim
out_ptrs = out_ptr + offs_m * stride_out_dim
final_states_ptrs = final_states_ptr + offs_m * stride_final_states_dim
if not HAS_INITSTATES:
states = tl.zeros((BLOCK_SIZE, ), dtype=tl.float32)
else:
initstates_ptrs = initstates_ptr + offs_m * stride_initstates_dim
states = tl.load(initstates_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32)
tl.store(out_ptrs, states, mask=offs_m < dim)
out_ptrs += stride_out_chunk
seq_idx = 0
for c in range(nchunks):
new_states = tl.load(states_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32)
dA_cs = tl.load(dA_cs_ptr).to(tl.float32)
scale = tl.exp(dA_cs)
if HAS_SEQ_IDX:
seq_idx_new = tl.load(seq_idx_ptr + (min((c + 1) * chunk_size, seqlen) - 1) * stride_seq_idx_seqlen)
scale = tl.where(seq_idx_new == seq_idx, scale, 0.0)
seq_idx = seq_idx_new
states = scale * states + new_states
if c < nchunks - 1:
tl.store(out_ptrs, states, mask=offs_m < dim)
else:
tl.store(final_states_ptrs, states, mask=offs_m < dim)
states_ptrs += stride_states_chunk
dA_cs_ptr += stride_dA_cs_chunk
out_ptrs += stride_out_chunk
@triton.autotune(
configs=[
triton.Config({'BLOCK_SIZE': 64}),
triton.Config({'BLOCK_SIZE': 128}),
triton.Config({'BLOCK_SIZE': 256}),
triton.Config({'BLOCK_SIZE': 512}),
triton.Config({'BLOCK_SIZE': 1024}),
triton.Config({'BLOCK_SIZE': 2048}),
],
key=['dim'],
)
@triton.jit
def _state_passing_bwd_kernel(
# Pointers to matrices
dout_ptr, out_ptr, dA_cs_ptr, dfinal_states_ptr, seq_idx_ptr,
dstates_ptr, ddA_cs_ptr, dinitstates_ptr, states_converted_ptr,
# Matrix dimensions
dim, nchunks, seqlen, chunk_size,
# Strides
stride_dout_batch, stride_dout_chunk, stride_dout_head, stride_dout_dim,
stride_out_batch, stride_out_chunk, stride_out_head, stride_out_dim,
stride_dA_cs_batch, stride_dA_cs_chunk, stride_dA_cs_head,
stride_dfinal_states_batch, stride_dfinal_states_head, stride_dfinal_states_dim,
stride_seq_idx_batch, stride_seq_idx_seqlen,
stride_dstates_batch, stride_dstates_chunk, stride_dstates_head, stride_dstates_dim,
stride_ddA_cs_batch, stride_ddA_cs_chunk, stride_ddA_cs_head,
stride_dinitstates_batch, stride_dinitstates_head, stride_dinitstates_dim,
# Meta-parameters
CONVERT_STATES: tl.constexpr,
HAS_DFINAL_STATES: tl.constexpr,
HAS_DINITSTATES: tl.constexpr,
HAS_SEQ_IDX: tl.constexpr,
BLOCK_SIZE: tl.constexpr,
):
pid_b = tl.program_id(axis=1)
pid_h = tl.program_id(axis=2)
pid_m = tl.program_id(axis=0)
dstates_ptr += pid_b * stride_dstates_batch + pid_h * stride_dstates_head + (nchunks - 1) * stride_dstates_chunk
dA_cs_ptr += pid_b * stride_dA_cs_batch + pid_h * stride_dA_cs_head + (nchunks - 1) * stride_dA_cs_chunk
ddA_cs_ptr += pid_b * stride_ddA_cs_batch + pid_h * stride_ddA_cs_head + (nchunks - 1) * stride_ddA_cs_chunk + pid_m
out_ptr += pid_b * stride_out_batch + pid_h * stride_out_head + (nchunks - 1) * stride_out_chunk
dout_ptr += pid_b * stride_dout_batch + pid_h * stride_dout_head + (nchunks - 1) * stride_dout_chunk
if CONVERT_STATES:
states_converted_ptr += pid_b * stride_out_batch + pid_h * stride_out_head + (nchunks - 1) * stride_out_chunk
if HAS_DFINAL_STATES:
dfinal_states_ptr += pid_b * stride_dfinal_states_batch + pid_h * stride_dfinal_states_head
if HAS_DINITSTATES:
dinitstates_ptr += pid_b * stride_dinitstates_batch + pid_h * stride_dinitstates_head
if HAS_SEQ_IDX:
seq_idx_ptr += pid_b * stride_seq_idx_batch
offs_m = pid_m * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
dstates_ptrs = dstates_ptr + offs_m * stride_dstates_dim
out_ptrs = out_ptr + offs_m * stride_out_dim
dout_ptrs = dout_ptr + offs_m * stride_dout_dim
if CONVERT_STATES:
states_converted_ptrs = states_converted_ptr + offs_m * stride_out_dim
if HAS_DFINAL_STATES:
dstates = tl.load(dfinal_states_ptr + offs_m * stride_dfinal_states_dim, mask=offs_m < dim, other=0.0).to(tl.float32)
else:
dstates = tl.zeros((BLOCK_SIZE, ), dtype=tl.float32)
tl.store(dstates_ptrs, dstates, mask=offs_m < dim)
if HAS_SEQ_IDX:
seq_idx = tl.load(seq_idx_ptr + (seqlen - 1) * stride_seq_idx_seqlen)
dstates_ptrs -= stride_dstates_chunk
for c in range(nchunks - 1):
dA_cs = tl.load(dA_cs_ptr).to(tl.float32)
scale = tl.exp(dA_cs)
if HAS_SEQ_IDX:
seq_idx_new = tl.load(seq_idx_ptr + (((nchunks - c - 1) * chunk_size - 1) * stride_seq_idx_seqlen))
scale = tl.where(seq_idx_new == seq_idx, scale, 0.0)
seq_idx = seq_idx_new
out = tl.load(out_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32)
if CONVERT_STATES:
tl.store(states_converted_ptrs, out, mask=offs_m < dim)
ddA = tl.sum(out * dstates) * scale
tl.store(ddA_cs_ptr, ddA)
dout = tl.load(dout_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32)
dstates = scale * dstates + dout
tl.store(dstates_ptrs, dstates, mask=offs_m < dim)
dout_ptrs -= stride_dout_chunk
dstates_ptrs -= stride_dstates_chunk
dA_cs_ptr -= stride_dA_cs_chunk
ddA_cs_ptr -= stride_ddA_cs_chunk
out_ptrs -= stride_out_chunk
if CONVERT_STATES:
states_converted_ptrs -= stride_out_chunk
if CONVERT_STATES:
out = tl.load(out_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32)
tl.store(states_converted_ptrs, out, mask=offs_m < dim)
if not HAS_DINITSTATES:
tl.store(ddA_cs_ptr, 0.0)
else:
dA_cs = tl.load(dA_cs_ptr).to(tl.float32)
scale = tl.exp(dA_cs)
if HAS_SEQ_IDX:
scale = tl.where(seq_idx == 0, scale, 0.0)
out = tl.load(out_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32)
ddA = tl.sum(out * dstates) * scale
tl.store(ddA_cs_ptr, ddA)
dout = tl.load(dout_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32)
dstates = scale * dstates + dout
tl.store(dinitstates_ptr + offs_m * stride_dinitstates_dim, dstates, mask=offs_m < dim)
def _state_passing_fwd(states, dA_chunk_cumsum, initial_states=None, seq_idx=None, chunk_size=None,
out_dtype=None):
batch, nchunks, nheads, dim = states.shape
assert dA_chunk_cumsum.shape == (batch, nheads, nchunks)
if initial_states is not None:
assert initial_states.shape == (batch, nheads, dim)
if seq_idx is not None:
assert chunk_size is not None
seqlen = seq_idx.shape[-1]
assert seq_idx.shape == (batch, seqlen)
out_dtype = states.dtype if out_dtype is None else out_dtype
out = torch.empty((batch, nchunks, nheads, dim), device=states.device, dtype=out_dtype)
final_states = torch.empty((batch, nheads, dim), device=states.device, dtype=torch.float32)
grid = lambda META: (triton.cdiv(dim, META['BLOCK_SIZE']), batch, nheads)
with torch.cuda.device(states.device.index):
_state_passing_fwd_kernel[grid](
states, out, final_states, dA_chunk_cumsum, initial_states, seq_idx,
dim, nchunks, seqlen if seq_idx is not None else 0, chunk_size if seq_idx is not None else 0,
states.stride(0), states.stride(1), states.stride(2), states.stride(3),
out.stride(0), out.stride(1), out.stride(2), out.stride(3),
final_states.stride(0), final_states.stride(1), final_states.stride(2),
dA_chunk_cumsum.stride(0), dA_chunk_cumsum.stride(2), dA_chunk_cumsum.stride(1),
*((initial_states.stride(0), initial_states.stride(1), initial_states.stride(2))
if initial_states is not None else (0, 0, 0)),
*((seq_idx.stride(0), seq_idx.stride(1)) if seq_idx is not None else (0, 0)),
HAS_INITSTATES=initial_states is not None,
HAS_SEQ_IDX=seq_idx is not None,
)
return out, final_states
def _state_passing_bwd(
states, dA_chunk_cumsum, dout, dfinal_states=None, seq_idx=None, has_initial_states=None,
dstates_dtype=None, states_dtype=None, chunk_size=None
):
"""
states contains the initial_states at index 0. The final states are not included in states.
"""
batch, nchunks, nheads, dim = states.shape
assert dA_chunk_cumsum.shape == (batch, nheads, nchunks)
assert dout.shape == (batch, nchunks, nheads, dim)
if seq_idx is not None:
assert chunk_size is not None
seqlen = seq_idx.shape[-1]
assert seq_idx.shape == (batch, seqlen)
dstates = torch.empty_like(dout, dtype=dstates_dtype if dstates_dtype is not None else dout.dtype)
if states_dtype is not None and states_dtype != states.dtype:
states_converted = torch.empty_like(states, dtype=dstates_dtype if dstates_dtype is not None else dout.dtype)
assert states_converted.stride() == states.stride()
else:
states_converted = None
if has_initial_states:
dinitstates = torch.empty_like(dstates[:, 0])
else:
dinitstates = None
if dfinal_states is not None:
assert dfinal_states.shape == (batch, nheads, dim)
BLOCK_SIZE_min = 64
n_blocks = (dim + BLOCK_SIZE_min - 1) // BLOCK_SIZE_min
ddA_chunk_cumsum = torch.empty(batch, nheads, nchunks, n_blocks,
dtype=torch.float32, device=dA_chunk_cumsum.device)
grid = lambda META: (triton.cdiv(dim, META['BLOCK_SIZE']), batch, nheads)
with torch.cuda.device(dout.device.index):
_state_passing_bwd_kernel[grid](
dout, states, dA_chunk_cumsum, dfinal_states, seq_idx,
dstates, ddA_chunk_cumsum, dinitstates, states_converted,
dim, nchunks, seqlen if seq_idx is not None else 0, chunk_size if seq_idx is not None else 0,
dout.stride(0), dout.stride(1), dout.stride(2), dout.stride(3),
states.stride(0), states.stride(1), states.stride(2), states.stride(3),
dA_chunk_cumsum.stride(0), dA_chunk_cumsum.stride(2), dA_chunk_cumsum.stride(1),
*((dfinal_states.stride(0), dfinal_states.stride(1), dfinal_states.stride(2))
if dfinal_states is not None else (0, 0, 0)),
*((seq_idx.stride(0), seq_idx.stride(1)) if seq_idx is not None else (0, 0)),
dstates.stride(0), dstates.stride(1), dstates.stride(2), dstates.stride(3),
ddA_chunk_cumsum.stride(0), ddA_chunk_cumsum.stride(2), ddA_chunk_cumsum.stride(1),
*((dinitstates.stride(0), dinitstates.stride(1), dinitstates.stride(2))
if dinitstates is not None else (0, 0, 0)),
CONVERT_STATES=states_converted is not None,
HAS_DFINAL_STATES=dfinal_states is not None,
HAS_DINITSTATES=dinitstates is not None,
HAS_SEQ_IDX=seq_idx is not None,
)
BLOCK_SIZE_actual = _state_passing_bwd_kernel.best_config.kwargs["BLOCK_SIZE"]
n_valid_blocks = (dim + BLOCK_SIZE_actual - 1) // BLOCK_SIZE_actual
ddA_chunk_cumsum = ddA_chunk_cumsum[..., :n_valid_blocks].sum(dim=-1).to(dtype=dA_chunk_cumsum.dtype)
if states_dtype is not None and states_dtype == states.dtype:
states_converted = states
return (dstates, ddA_chunk_cumsum, dinitstates) if states_dtype is None else (dstates, ddA_chunk_cumsum, dinitstates, states_converted)
class StatePassingFn(torch.autograd.Function):
@staticmethod
def forward(ctx, states, dA_chunk_cumsum, initial_states=None):
batch, nchunks, nheads, dim = states.shape
assert dA_chunk_cumsum.shape == (batch, nheads, nchunks)
if states.stride(-1) != 1:
states = states.contiguous()
out, final_states = _state_passing_fwd(states, dA_chunk_cumsum, initial_states)
ctx.save_for_backward(out, dA_chunk_cumsum)
ctx.has_initial_states = initial_states is not None
return out, final_states
@staticmethod
def backward(ctx, dout, dfinal_states):
out, dA_chunk_cumsum = ctx.saved_tensors
batch, nchunks, nheads, dim = out.shape
assert dout.shape == (batch, nchunks, nheads, dim)
assert dA_chunk_cumsum.shape == (batch, nheads, nchunks)
assert dfinal_states.shape == (batch, nheads, dim)
if dout.stride(-1) != 1:
dout = dout.contiguous()
dstates, ddA_chunk_cumsum, dinitstates = _state_passing_bwd(
            out, dA_chunk_cumsum, dout, dfinal_states=dfinal_states, has_initial_states=ctx.has_initial_states
)
return dstates, ddA_chunk_cumsum, dinitstates
def state_passing(states, dA_chunk_cumsum, initial_states=None):
"""
Argument:
states: (batch, nchunks, nheads, dim)
dA_chunk_cumsum: (batch, nheads, nchunks)
initial_states: (batch, nheads, dim)
Return:
out: (batch, nchunks, nheads, dim)
final_states: (batch, nheads, dim)
"""
return StatePassingFn.apply(states, dA_chunk_cumsum, initial_states)
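# A minimal sketch of the recurrence that state_passing implements (an illustration consistent
# with the forward kernel above, not additional library code): with running = initial_states
# (or zeros),
#
#     out[:, 0] = running
#     for c in range(nchunks):
#         running = torch.exp(dA_chunk_cumsum[:, :, c])[..., None] * running + states[:, c]
#         if c < nchunks - 1:
#             out[:, c + 1] = running
#     final_states = running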
def state_passing_ref(states, dA_chunk_cumsum, initial_states=None):
"""
Argument:
states: (batch, nchunks, nheads, dim)
dA_chunk_cumsum: (batch, nheads, nchunks)
initial_states: (batch, nheads, dim)
Return:
out: (batch, nchunks, nheads, dim)
final_states: (batch, nheads, dim)
"""
if initial_states is None:
initial_states = torch.zeros_like(states[:, 0])
states = torch.cat([rearrange(initial_states, "b h d -> b 1 h d"), states], dim=1)
dA_chunk_cumsum = F.pad(dA_chunk_cumsum, (1, 0))
dA_chunk_cumsum = torch.cumsum(dA_chunk_cumsum, dim=-1)
nchunks = dA_chunk_cumsum.shape[-1]
# (batch, nheads, nchunks, nchunks)
dt_chunk_segment_sum = dA_chunk_cumsum[:, :, :, None] - dA_chunk_cumsum[:, :, None, :]
# (batch, nheads, nchunks, nchunks)
decay_chunk = torch.exp(dt_chunk_segment_sum)
causal_mask = torch.tril(torch.ones(nchunks, nchunks, device=states.device, dtype=bool), diagonal=0)
decay_chunk = decay_chunk.masked_fill(~causal_mask, 0)
out = torch.einsum("bhzc,bchd->bzhd", decay_chunk.to(dtype=states.dtype), states)
return out[:, :-1], out[:, -1]
# Copyright (c) 2023, Albert Gu, Tri Dao.
import gc
import time
from collections import namedtuple
from dataclasses import dataclass, field
from functools import partial
from typing import Callable, Optional, Sequence, Union
import torch
import torch.nn.functional as F
from einops import rearrange, repeat
from torch import Tensor
from torch.profiler import ProfilerActivity, profile, record_function
from transformers.generation import GreedySearchDecoderOnlyOutput, SampleDecoderOnlyOutput, TextStreamer
@dataclass
class InferenceParams:
"""Inference parameters that are passed to the main model in order
to efficienly calculate and store the context during inference."""
max_seqlen: int
max_batch_size: int
seqlen_offset: int = 0
batch_size_offset: int = 0
key_value_memory_dict: dict = field(default_factory=dict)
lengths_per_sample: Optional[Tensor] = None
def reset(self, max_seqlen, max_batch_size):
self.max_seqlen = max_seqlen
self.max_batch_size = max_batch_size
self.seqlen_offset = 0
if self.lengths_per_sample is not None:
self.lengths_per_sample.zero_()
def modify_logits_for_min_p_filtering(logits, min_p):
"""Set the logits for none min_p values to -inf. Done in-place."""
if min_p <= 0.0 or min_p >= 1.0:
return
indices_to_remove = logits < min_p
logits.masked_fill_(indices_to_remove, float("-Inf"))
# https://github.com/NVIDIA/Megatron-LM/blob/0bb597b42c53355a567aba2a1357cc34b9d99ddd/megatron/text_generation/sampling.py
# https://github.com/huggingface/transformers/blob/a44985b41cfa2de48a5e1de7f1f93b7483da25d1/src/transformers/generation/logits_process.py#L231
def modify_logits_for_top_k_filtering(logits, top_k):
"""Set the logits for none top-k values to -inf. Done in-place."""
indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
logits.masked_fill_(indices_to_remove, float("-Inf"))
# https://github.com/NVIDIA/Megatron-LM/blob/0bb597b42c53355a567aba2a1357cc34b9d99ddd/megatron/text_generation/sampling.py
# https://github.com/huggingface/transformers/blob/a44985b41cfa2de48a5e1de7f1f93b7483da25d1/src/transformers/generation/logits_process.py#L170
def modify_logits_for_top_p_filtering(logits, top_p):
"""Set the logits for none top-p values to -inf. Done in-place."""
if top_p <= 0.0 or top_p >= 1.0:
return
# First sort and calculate cumulative sum of probabilities.
sorted_logits, sorted_indices = torch.sort(logits, descending=False)
cumulative_probs = sorted_logits.softmax(dim=-1).cumsum(dim=-1)
    # Remove tokens whose cumulative probability (summed from the smallest logits up) is <= 1 - top_p,
    # i.e. keep the smallest set of top tokens whose probabilities sum to at least top_p
sorted_indices_to_remove = cumulative_probs <= (1 - top_p)
# scatter sorted tensors to original indexing
indices_to_remove = sorted_indices_to_remove.scatter(
1, sorted_indices, sorted_indices_to_remove
)
logits.masked_fill_(indices_to_remove, float("-inf"))
def modify_logit_for_repetition_penalty(logits, prev_output_tokens, repetition_penalty=1.0):
"""Apply repetition penalty. See https://arxiv.org/abs/1909.05858
logits: (batch_size, vocab_size)
prev_output_tokens: (batch_size, seq_len)
"""
if repetition_penalty == 1.0:
return logits
score = torch.gather(logits, 1, prev_output_tokens)
    # If score < 0, multiply by the penalty (rather than divide) so that the previous token's probability is reduced
score = torch.where(score < 0, score * repetition_penalty, score / repetition_penalty)
logits.scatter_(1, prev_output_tokens, score)
return logits
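# Worked example (made-up numbers for illustration only): with repetition_penalty=1.2, a
# previously generated token whose current logit is 2.0 becomes 2.0 / 1.2 ≈ 1.67, while one
# whose logit is -1.0 becomes -1.0 * 1.2 = -1.2; in both cases the token's post-softmax
# probability goes down.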
def sample(logits, top_k=1, top_p=0.0, min_p=0.0, temperature=1.0):
"""Sample from top-k logits.
Arguments:
logits: Tensor of shape (batch_size, vocab_size)
"""
if top_k == 1: # Short-circuit for greedy decoding
return logits.argmax(dim=-1)
else:
if top_p > 0.0:
assert top_p <= 1.0, "top-p should be in (0, 1]."
if top_k > 0:
top_k = min(top_k, logits.size(-1)) # Safety check
logits_top, indices = torch.topk(logits, top_k, dim=-1)
if temperature != 1.0:
logits_top /= temperature
modify_logits_for_top_p_filtering(logits_top, top_p)
return indices[
torch.arange(indices.shape[0], device=indices.device),
torch.multinomial(torch.softmax(logits_top, dim=-1), num_samples=1).squeeze(dim=-1),
]
else:
if min_p > 0.0:
logits_top = logits.clone()
max_prob = logits_top[..., 0].item()
min_prob = max_prob * min_p
modify_logits_for_min_p_filtering(logits_top, min_prob)
if temperature != 1.0:
logits_top /= temperature
return torch.multinomial(torch.softmax(logits_top, dim=-1), num_samples=1).squeeze(dim=-1)
# Clone so that when we modify for top_p we don't change the original logits
logits_top = logits / temperature if temperature != 1.0 else logits.clone()
modify_logits_for_top_p_filtering(logits_top, top_p)
return torch.multinomial(torch.softmax(logits_top, dim=-1), num_samples=1).squeeze(
dim=-1
)
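# Illustrative call (a sketch; the tensor and hyperparameters below are assumptions):
#
#     logits = torch.randn(4, 50257, device="cuda")          # (batch_size, vocab_size)
#     next_tokens = sample(logits, top_k=50, top_p=0.9, temperature=0.8)
#     # top-k keeps the 50 largest logits per row, top-p then restricts to the smallest set of
#     # those whose probabilities sum to at least 0.9, and one token per row is drawn.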
@torch.inference_mode()
def decode(
input_ids,
model,
max_length,
top_k=1,
top_p=0.0,
min_p=0.0,
temperature=1.0,
repetition_penalty=1.0,
eos_token_id=None,
teacher_outputs=None,
vocab_size=None,
cg=False,
enable_timing=False,
streamer: Optional[TextStreamer] = None
):
"""Decoding, either greedy or with top-k or top-p sampling.
If top-k = 0, don't limit the number of candidates (pure sampling).
Top-k and top-p can be used together. If top_k > 0 and top_p > 0, then top-k is applied first,
then top-p.
We assume that all sequences in the same batch have the same length.
Arguments:
input_ids: (batch, seq_len)
max_length: int
teacher_outputs (optional): (batch, seq_len). If provided, instead of sampling from the
logits, the next token is taken from the teacher_outputs. Useful for testing.
Returns: GreedySearchDecoderOnlyOutput or SampleDecoderOnlyOutput, with the following fields:
sequences: (batch, max_length)
        scores: tuple of tensors, each of shape (batch, vocab_size)
"""
if streamer is not None:
streamer.put(input_ids.cpu())
batch_size, seqlen_og = input_ids.shape
teacher_output_len = teacher_outputs.shape[1] if teacher_outputs is not None else 0
if cg:
if not hasattr(model, "_decoding_cache"):
model._decoding_cache = None
model._decoding_cache = update_graph_cache(
model,
model._decoding_cache,
batch_size,
seqlen_og,
max_length,
)
inference_params = model._decoding_cache.inference_params
inference_params.reset(max_length, batch_size)
else:
inference_params = InferenceParams(max_seqlen=max_length, max_batch_size=batch_size)
def get_logits(input_ids, inference_params):
decoding = inference_params.seqlen_offset > 0
if decoding:
position_ids = torch.full(
(batch_size, 1),
inference_params.seqlen_offset,
dtype=torch.long,
device=input_ids.device,
)
else:
position_ids = None
if not cg or not decoding:
logits = model(
input_ids,
position_ids=position_ids,
inference_params=inference_params,
num_last_tokens=1,
).logits.squeeze(dim=1)
else:
logits = model._decoding_cache.run(
input_ids, position_ids, inference_params.seqlen_offset
).squeeze(dim=1)
return logits[..., :vocab_size] if vocab_size is not None else logits
def sample_tokens(logits, inference_params):
if teacher_outputs is None or teacher_output_len <= inference_params.seqlen_offset:
token = sample(logits, top_k=top_k, top_p=top_p, min_p=min_p, temperature=temperature)
else:
token = teacher_outputs[:, inference_params.seqlen_offset]
# return rearrange(token, "b -> b 1")
return token.unsqueeze(1)
def should_stop(current_token, inference_params):
if inference_params.seqlen_offset == 0:
return False
if eos_token_id is not None and (current_token == eos_token_id).all():
return True
if inference_params.seqlen_offset >= max_length - 1:
return True
return False
start = torch.cuda.Event(enable_timing=enable_timing)
end = torch.cuda.Event(enable_timing=enable_timing)
if enable_timing:
start.record()
scores, sequences = [], [input_ids]
sequences_cat = input_ids
while not should_stop(sequences[-1], inference_params):
scores.append(get_logits(sequences[-1], inference_params))
inference_params.seqlen_offset += sequences[-1].shape[1]
if repetition_penalty == 1.0:
sampled_tokens = sample_tokens(scores[-1], inference_params)
else:
logits = modify_logit_for_repetition_penalty(
scores[-1].clone(), sequences_cat, repetition_penalty
)
sampled_tokens = sample_tokens(logits, inference_params)
sequences_cat = torch.cat([sequences_cat, sampled_tokens], dim=1)
sequences.append(sampled_tokens)
if streamer is not None:
streamer.put(sampled_tokens.cpu())
if streamer is not None:
streamer.end()
if enable_timing:
end.record()
torch.cuda.synchronize()
print(f"Prompt processing + decoding time: {(start.elapsed_time(end)):.0f}ms")
output_cls = GreedySearchDecoderOnlyOutput if top_k == 1 else SampleDecoderOnlyOutput
return output_cls(sequences=torch.cat(sequences, dim=1), scores=tuple(scores))
class GenerationMixin:
def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
raise NotImplementedError
def generate(
self,
input_ids,
max_length,
top_k=1,
top_p=0.0,
min_p=0.0,
temperature=1.0,
return_dict_in_generate=False,
output_scores=False,
**kwargs,
):
output = decode(
input_ids, self, max_length, top_k=top_k, top_p=top_p, min_p = min_p, temperature=temperature, **kwargs
)
if not output_scores:
output.scores = None
return output if return_dict_in_generate else output.sequences
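# Illustrative usage of GenerationMixin.generate on a model that inherits from it (a sketch;
# the tokenizer, model and prompt are assumptions, not part of this file):
#
#     input_ids = tokenizer("Hello", return_tensors="pt").input_ids.to("cuda")
#     out = model.generate(input_ids, max_length=64, top_k=50, top_p=0.9,
#                          temperature=0.8, cg=True, return_dict_in_generate=True,
#                          output_scores=True)
#     # out.sequences: (batch, max_length); out.scores: one logits tensor per generated step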
@dataclass
class DecodingCGCache:
max_batch_size: int = 0
max_seqlen: int = 0
device = None
dtype = None
callables: dict = field(default_factory=dict)
mempool = None
inference_params: Optional[InferenceParams] = None
run: Optional[Callable] = None
@torch.inference_mode()
def update_graph_cache(
model,
cache,
batch_size,
seqlen_og,
max_seqlen,
decoding_seqlens=(1,),
dtype=None,
n_warmups=2,
):
if cache is None:
cache = DecodingCGCache()
param_example = next(iter(model.parameters()))
device = param_example.device
if dtype is None:
dtype = param_example.dtype
if (
(device, dtype) != (cache.device, cache.dtype)
or batch_size > cache.max_batch_size
or max_seqlen > cache.max_seqlen
): # Invalidate the cache
cache.callables = {}
cache.mempool = None
cache.inference_params = None
gc.collect()
cache.device, cache.dtype = device, dtype
cache.max_batch_size, cache.max_seqlen = batch_size, max_seqlen
assert hasattr(model, "allocate_inference_cache"), "CUDA graph decoding requires that the model has a method allocate_inference_cache"
inf_cache = model.allocate_inference_cache(batch_size, max_seqlen, dtype)
lengths_per_sample = torch.full((batch_size,), seqlen_og, dtype=torch.int32, device=device)
cache.inference_params = InferenceParams(
max_seqlen=max_seqlen,
max_batch_size=batch_size,
seqlen_offset=seqlen_og,
key_value_memory_dict=inf_cache,
lengths_per_sample=lengths_per_sample,
)
cache.mempool = torch.cuda.graphs.graph_pool_handle()
for decoding_seqlen in decoding_seqlens:
if (batch_size, decoding_seqlen) not in cache.callables:
cache.callables[batch_size, decoding_seqlen] = capture_graph(
model,
cache.inference_params,
batch_size,
max_seqlen,
decoding_seqlen=decoding_seqlen,
mempool=cache.mempool,
n_warmups=n_warmups,
)
def dispatch(input_ids, position_ids, seqlen):
batch_size, decoding_seqlen = input_ids.shape[:2]
return cache.callables[batch_size, decoding_seqlen](input_ids, position_ids, seqlen)
cache.run = dispatch
cache.inference_params.seqlen_offset = 0 # Reset so it's not confusing
return cache
def capture_graph(
model, inference_params, batch_size, max_seqlen, decoding_seqlen=1, mempool=None, n_warmups=2
):
device = next(iter(model.parameters())).device
input_ids = torch.full((batch_size, decoding_seqlen), 0, dtype=torch.long, device=device)
position_ids = torch.full((batch_size, decoding_seqlen), 0, dtype=torch.long, device=device)
seqlen_offset_og = inference_params.seqlen_offset
inference_params.seqlen_offset = max_seqlen - decoding_seqlen
inference_params.lengths_per_sample[:] = inference_params.seqlen_offset
# Warmup before capture
s = torch.cuda.Stream()
s.wait_stream(torch.cuda.current_stream())
with torch.cuda.stream(s):
for _ in range(n_warmups):
logits = model(
input_ids,
position_ids=position_ids,
inference_params=inference_params,
num_last_tokens=decoding_seqlen,
).logits
s.synchronize()
    # This might be needed for correctness if we run with NCCL_GRAPH_MIXING_SUPPORT=0,
    # which requires that graph launches and non-captured launches do not overlap (at least
    # that is how I interpret the documentation). I'm not sure whether this is strictly required.
if torch.distributed.is_initialized():
torch.distributed.barrier()
torch.cuda.current_stream().wait_stream(s)
# Captures the graph
# To allow capture, automatically sets a side stream as the current stream in the context
graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(graph, pool=mempool):
logits = model(
input_ids,
position_ids=position_ids,
inference_params=inference_params,
num_last_tokens=decoding_seqlen,
).logits
def run(new_input_ids, new_position_ids, seqlen):
inference_params.lengths_per_sample[:] = seqlen
input_ids.copy_(new_input_ids)
position_ids.copy_(new_position_ids)
graph.replay()
return logits.clone()
inference_params.seqlen_offset = seqlen_offset_og
return run
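# The general pattern used by capture_graph above (a generic, stand-alone sketch for
# reference; it is not tied to this model): allocate static input tensors, warm up on a side
# stream, capture once, then replay with new data copied into the static inputs.
#
#     static_in = torch.zeros(8, 128, device="cuda")
#     s = torch.cuda.Stream()
#     s.wait_stream(torch.cuda.current_stream())
#     with torch.cuda.stream(s):
#         for _ in range(2):
#             static_out = static_in * 2            # warmup (stand-in for the model call)
#     torch.cuda.current_stream().wait_stream(s)
#     g = torch.cuda.CUDAGraph()
#     with torch.cuda.graph(g):
#         static_out = static_in * 2                # captured work
#     static_in.copy_(torch.randn(8, 128, device="cuda"))
#     g.replay()                                    # static_out now holds the new result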
import json
import torch
from transformers.utils import WEIGHTS_NAME, CONFIG_NAME
from transformers.utils.hub import cached_file
def load_config_hf(model_name):
resolved_archive_file = cached_file(model_name, CONFIG_NAME, _raise_exceptions_for_missing_entries=False)
return json.load(open(resolved_archive_file))
def load_state_dict_hf(model_name, device=None, dtype=None):
# If not fp32, then we don't want to load directly to the GPU
mapped_device = "cpu" if dtype not in [torch.float32, None] else device
resolved_archive_file = cached_file(model_name, WEIGHTS_NAME, _raise_exceptions_for_missing_entries=False)
    state_dict = torch.load(resolved_archive_file, map_location=mapped_device)
    # Convert dtype before moving to GPU to save memory
    if dtype is not None:
        state_dict = {k: v.to(dtype=dtype) for k, v in state_dict.items()}
    if device is not None:
        state_dict = {k: v.to(device=device) for k, v in state_dict.items()}
    return state_dict
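# Illustrative usage (the model name below is an example placeholder):
#
#     config = load_config_hf("state-spaces/mamba-130m")
#     state_dict = load_state_dict_hf("state-spaces/mamba-130m", device="cuda", dtype=torch.bfloat16)
#     # config is a plain dict parsed from config.json; state_dict maps parameter names to tensors.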
--- /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h 2023-12-12 20:11:48.000000000 +0000
+++ rocm_update_files/amd_hip_bf16.h 2024-05-20 17:40:26.983349079 +0000
@@ -137,7 +137,7 @@
* \ingroup HIP_INTRINSIC_BFLOAT16_CONV
* \brief Converts float to bfloat16
*/
-__HOST_DEVICE__ __hip_bfloat16 __float2bfloat16(float f) {
+__HOST_DEVICE__ static inline __hip_bfloat16 __float2bfloat16(float f) {
__hip_bfloat16 ret;
union {
float fp32;
@@ -181,7 +181,7 @@
* \ingroup HIP_INTRINSIC_BFLOAT162_CONV
* \brief Converts and moves bfloat162 to float2
*/
-__HOST_DEVICE__ float2 __bfloat1622float2(const __hip_bfloat162 a) {
+__HOST_DEVICE__ static inline float2 __bfloat1622float2(const __hip_bfloat162 a) {
return float2{__bfloat162float(a.x), __bfloat162float(a.y)};
}
@@ -209,7 +209,7 @@
* \ingroup HIP_INTRINSIC_BFLOAT162_CONV
* \brief Convert double to __hip_bfloat16
*/
-__HOST_DEVICE__ __hip_bfloat16 __double2bfloat16(const double a) {
+__HOST_DEVICE__ static inline __hip_bfloat16 __double2bfloat16(const double a) {
return __float2bfloat16((float)a);
}
@@ -217,7 +217,7 @@
* \ingroup HIP_INTRINSIC_BFLOAT162_CONV
* \brief Convert float2 to __hip_bfloat162
*/
-__HOST_DEVICE__ __hip_bfloat162 __float22bfloat162_rn(const float2 a) {
+__HOST_DEVICE__ static inline __hip_bfloat162 __float22bfloat162_rn(const float2 a) {
return __hip_bfloat162{__float2bfloat16(a.x), __float2bfloat16(a.y)};
}
@@ -247,7 +247,7 @@
* \ingroup HIP_INTRINSIC_BFLOAT162_CONV
* \brief Converts high 16 bits of __hip_bfloat162 to float and returns the result
*/
-__HOST_DEVICE__ float __high2float(const __hip_bfloat162 a) { return __bfloat162float(a.y); }
+__HOST_DEVICE__ static inline float __high2float(const __hip_bfloat162 a) { return __bfloat162float(a.y); }
/**
* \ingroup HIP_INTRINSIC_BFLOAT162_CONV
@@ -275,7 +275,7 @@
* \ingroup HIP_INTRINSIC_BFLOAT162_CONV
* \brief Converts low 16 bits of __hip_bfloat162 to float and returns the result
*/
-__HOST_DEVICE__ float __low2float(const __hip_bfloat162 a) { return __bfloat162float(a.x); }
+__HOST_DEVICE__ static inline float __low2float(const __hip_bfloat162 a) { return __bfloat162float(a.x); }
/**
* \ingroup HIP_INTRINSIC_BFLOAT162_CONV
# Copyright (C) 2023, Tri Dao.
import math
import torch
import torch.nn.functional as F
import pytest
from einops import rearrange
from mamba_ssm.ops.selective_scan_interface import selective_scan_fn, selective_scan_ref
from mamba_ssm.ops.selective_scan_interface import mamba_inner_fn, mamba_inner_ref
# @pytest.mark.parametrize('wtype', [torch.float32, torch.complex64])
@pytest.mark.parametrize('wtype', [torch.float32])
# @pytest.mark.parametrize('itype', [torch.float32, torch.float16, torch.bfloat16])
@pytest.mark.parametrize('itype', [torch.float32])
# @pytest.mark.parametrize('seqlen', [8, 16, 32, 64, 128, 256, 372, 512, 784, 1024, 1134, 2048, 4096])
@pytest.mark.parametrize('seqlen', [128, 256, 512, 1024, 2048, 4096])
# @pytest.mark.parametrize('seqlen', [128])
# @pytest.mark.parametrize("return_last_state", [False, True])
@pytest.mark.parametrize("return_last_state", [True])
# @pytest.mark.parametrize('has_delta_bias', [False, True])
@pytest.mark.parametrize('has_delta_bias', [True])
# @pytest.mark.parametrize('delta_softplus', [False, True])
@pytest.mark.parametrize('delta_softplus', [True])
# @pytest.mark.parametrize('has_z', [False, True])
@pytest.mark.parametrize('has_z', [True])
# @pytest.mark.parametrize('has_D', [False, True])
@pytest.mark.parametrize('has_D', [True])
@pytest.mark.parametrize("varBC_groups", [1, 2])
# @pytest.mark.parametrize("varBC_groups", [1])
# @pytest.mark.parametrize("is_variable_C", [False, True])
@pytest.mark.parametrize("is_variable_C", [True])
# @pytest.mark.parametrize("is_variable_B", [False, True])
@pytest.mark.parametrize("is_variable_B", [True])
def test_selective_scan(is_variable_B, is_variable_C, varBC_groups, has_D, has_z, has_delta_bias,
delta_softplus, return_last_state, seqlen, itype, wtype):
if varBC_groups > 1 and (not is_variable_B or not is_variable_C):
pytest.skip() # This config is not applicable
device = 'cuda'
rtol, atol = (6e-4, 2e-3) if itype == torch.float32 else (3e-3, 5e-3)
if itype == torch.bfloat16:
rtol, atol = 3e-2, 5e-2
rtolw, atolw = (1e-3, 1e-3)
if has_z: # If we have z, the errors on the weights seem higher
rtolw = max(rtolw, rtol)
atolw = max(atolw, atol)
# set seed
torch.random.manual_seed(0)
batch_size = 2
dim = 4
dstate = 8
is_complex = wtype == torch.complex64
A = (-0.5 * torch.rand(dim, dstate, device=device, dtype=wtype)).requires_grad_()
if not is_variable_B:
B_shape = (dim, dstate)
elif varBC_groups == 1:
B_shape = (batch_size, dstate, seqlen if not is_complex else seqlen * 2)
else:
B_shape = (batch_size, varBC_groups, dstate, seqlen if not is_complex else seqlen * 2)
B = torch.randn(*B_shape, device=device, dtype=wtype if not is_variable_B else itype,
requires_grad=True)
if not is_variable_C:
C_shape = (dim, dstate)
elif varBC_groups == 1:
C_shape = (batch_size, dstate, seqlen if not is_complex else seqlen * 2)
else:
C_shape = (batch_size, varBC_groups, dstate, seqlen if not is_complex else seqlen * 2)
C = torch.randn(*C_shape, device=device, dtype=wtype if not is_variable_C else itype,
requires_grad=True)
if has_D:
D = torch.randn(dim, device=device, dtype=torch.float32, requires_grad=True)
else:
D = None
if has_z:
z = torch.randn(batch_size, dim, seqlen, device=device, dtype=itype, requires_grad=True)
else:
z = None
if has_delta_bias:
delta_bias = (0.5 * torch.rand(dim, device=device, dtype=torch.float32)).requires_grad_()
else:
delta_bias = None
u = torch.randn(batch_size, dim, seqlen, device=device, dtype=itype, requires_grad=True)
delta = (0.5 * torch.rand(batch_size, dim, seqlen, device=device, dtype=itype)).requires_grad_()
A_ref = A.detach().clone().requires_grad_()
B_ref = B.detach().clone().requires_grad_()
C_ref = C.detach().clone().requires_grad_()
D_ref = D.detach().clone().requires_grad_() if D is not None else None
z_ref = z.detach().clone().requires_grad_() if z is not None else None
u_ref = u.detach().clone().requires_grad_()
delta_ref = delta.detach().clone().requires_grad_()
delta_bias_ref = delta_bias.detach().clone().requires_grad_() if delta_bias is not None else None
out, *rest = selective_scan_fn(
u, delta, A, B, C, D, z=z,
delta_bias=delta_bias, delta_softplus=delta_softplus,
return_last_state=return_last_state
)
if return_last_state:
state = rest[0]
out_ref, *rest = selective_scan_ref(
u_ref, delta_ref, A_ref, B_ref, C_ref, D_ref, z=z_ref,
delta_bias=delta_bias_ref, delta_softplus=delta_softplus,
return_last_state=return_last_state
)
if return_last_state:
state_ref = rest[0]
# dA = torch.exp(torch.einsum('bdl,dn->bdln', delta, A))
# dt_u = delta * u
print(f'Output max diff: {(out - out_ref).abs().max().item()}')
print(f'Output mean diff: {(out - out_ref).abs().mean().item()}')
assert torch.allclose(out, out_ref, rtol=rtol, atol=atol)
if return_last_state:
print(f'State max diff: {(state - state_ref).abs().max().item()}')
assert torch.allclose(state, state_ref, rtol=rtol, atol=atol)
g = torch.randn_like(out)
out_ref.backward(g)
out.backward(g)
print(f'du max diff: {(u.grad - u_ref.grad).abs().max().item()}')
print(f'ddelta max diff: {(delta.grad - delta_ref.grad).abs().max().item()}')
print(f'dA max diff: {(A.grad - A_ref.grad).abs().max().item()}')
print(f'dB max diff: {(B.grad - B_ref.grad).abs().max().item()}')
print(f'dC max diff: {(C.grad - C_ref.grad).abs().max().item()}')
if has_D:
print(f'dD max diff: {(D.grad - D_ref.grad).abs().max().item()}')
if has_z:
print(f'dz max diff: {(z.grad - z_ref.grad).abs().max().item()}')
if has_delta_bias:
print(f'ddelta_bias max diff: {(delta_bias.grad - delta_bias_ref.grad).abs().max().item()}')
assert torch.allclose(u.grad, u_ref.grad.to(dtype=itype), rtol=rtol * 2, atol=atol * 2)
assert torch.allclose(delta.grad, delta_ref.grad.to(dtype=itype), rtol=rtol * 5, atol=atol * 10)
assert torch.allclose(A.grad, A_ref.grad, rtol=rtolw, atol=atolw * 5)
assert torch.allclose(B.grad, B_ref.grad, rtol=rtolw if not is_variable_B else rtol,
atol=atolw if not is_variable_B else atol)
assert torch.allclose(C.grad, C_ref.grad, rtol=rtolw if not is_variable_C else rtol,
atol=atolw if not is_variable_C else atol)
if has_D:
assert torch.allclose(D.grad, D_ref.grad, rtol=rtolw, atol=atolw)
if has_z:
assert torch.allclose(z.grad, z_ref.grad, rtol=rtolw, atol=atolw)
if has_delta_bias:
assert torch.allclose(delta_bias.grad, delta_bias_ref.grad, rtol=rtolw, atol=atolw)
@pytest.mark.parametrize('wtype', [torch.float32, torch.complex64])
# @pytest.mark.parametrize('wtype', [torch.complex64])
# @pytest.mark.parametrize('itype', [torch.float32, torch.float16, torch.bfloat16])
@pytest.mark.parametrize('itype', [torch.float32])
# @pytest.mark.parametrize('seqlen', [8, 16, 32, 64, 128, 256, 372, 512, 784, 1024, 1134, 2048, 4096])
@pytest.mark.parametrize('seqlen', [128])
@pytest.mark.parametrize("is_variable_C", [False, True])
# @pytest.mark.parametrize("is_variable_C", [False])
@pytest.mark.parametrize("is_variable_B", [False, True])
# @pytest.mark.parametrize("is_variable_B", [True])
def test_mamba_inner_fn(is_variable_B, is_variable_C, seqlen, itype, wtype):
device = 'cuda'
rtol, atol = (6e-4, 2e-3) if itype == torch.float32 else (3e-3, 5e-3)
if itype == torch.bfloat16:
rtol, atol = 3e-2, 5e-2
rtolw, atolw = (1e-3, 1e-3)
# If we have z, the errors on the weights seem higher
rtolw = max(rtolw, rtol)
atolw = max(atolw, atol)
# set seed
torch.random.manual_seed(0)
batch_size = 2
dim = 768
dstate = 8
dt_rank = 48
is_complex = wtype == torch.complex64
xz = torch.randn(batch_size, 2 * dim, seqlen, device=device, dtype=itype, requires_grad=True)
conv1d_weight = torch.randn(dim, 1, 3, device=device, dtype=torch.float32, requires_grad=True)
conv1d_bias = torch.randn(dim, device=device, dtype=torch.float32, requires_grad=True)
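    # x_proj produces dt (dt_rank) plus B and C (dstate each, doubled when complex) when they are input-dependent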
x_proj_weight = torch.randn(dt_rank + (bool(is_variable_B) + bool(is_variable_C)) * dstate
* (1 if not is_complex else 2),
dim, device=device, dtype=itype, requires_grad=True)
delta_proj_weight = torch.randn(dim, dt_rank, device=device, dtype=itype, requires_grad=True)
out_proj_weight = torch.randn(dim // 2, dim, device=device, dtype=itype, requires_grad=True)
out_proj_bias = None
A = (-0.5 * torch.rand(dim, dstate, device=device, dtype=wtype)).requires_grad_()
B = (torch.randn(dim, dstate, device=device, dtype=wtype, requires_grad=True)
if not is_variable_B else None)
C = (torch.randn(dim, dstate, device=device, dtype=wtype, requires_grad=True)
if not is_variable_C else None)
D = torch.randn(dim, device=device, dtype=torch.float32, requires_grad=True)
delta_bias = (0.5 * torch.rand(dim, device=device, dtype=torch.float32)).requires_grad_()
B_proj_bias = None
C_proj_bias = None
xz_ref = xz.detach().clone().requires_grad_()
conv1d_weight_ref = conv1d_weight.detach().clone().requires_grad_()
conv1d_bias_ref = conv1d_bias.detach().clone().requires_grad_()
x_proj_weight_ref = x_proj_weight.detach().clone().requires_grad_()
delta_proj_weight_ref = delta_proj_weight.detach().clone().requires_grad_()
out_proj_weight_ref = out_proj_weight.detach().clone().requires_grad_()
out_proj_bias_ref = (out_proj_bias.detach().clone().requires_grad_()
if out_proj_bias is not None else None)
A_ref = A.detach().clone().requires_grad_()
B_ref = B.detach().clone().requires_grad_() if B is not None else None
C_ref = C.detach().clone().requires_grad_() if C is not None else None
D_ref = D.detach().clone().requires_grad_()
delta_bias_ref = delta_bias.detach().clone().requires_grad_() if delta_bias is not None else None
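    # Compare the fused mamba_inner_fn against the reference implementation on identical inputs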
out = mamba_inner_fn(xz, conv1d_weight, conv1d_bias, x_proj_weight, delta_proj_weight,
out_proj_weight, out_proj_bias,
A, B, C, D, delta_bias=delta_bias, delta_softplus=True)
out_ref = mamba_inner_ref(xz_ref, conv1d_weight_ref, conv1d_bias_ref, x_proj_weight_ref,
delta_proj_weight_ref, out_proj_weight_ref, out_proj_bias_ref,
A_ref, B_ref, C_ref, D_ref,
delta_bias=delta_bias_ref, delta_softplus=True)
# dA = torch.exp(torch.einsum('bdl,dn->bdln', delta, A))
# dt_u = delta * u
print(f'Output max diff: {(out - out_ref).abs().max().item()}')
print(f'Output mean diff: {(out - out_ref).abs().mean().item()}')
assert torch.allclose(out, out_ref, rtol=rtol, atol=atol)
g = torch.randn_like(out)
out_ref.backward(g)
out.backward(g)
print(f'dxz max diff: {(xz.grad - xz_ref.grad).abs().max().item()}')
print(f'dA max diff: {(A.grad - A_ref.grad).abs().max().item()}')
if not is_variable_B:
print(f'dB max diff: {(B.grad - B_ref.grad).abs().max().item()}')
if not is_variable_C:
print(f'dC max diff: {(C.grad - C_ref.grad).abs().max().item()}')
print(f'dD max diff: {(D.grad - D_ref.grad).abs().max().item()}')
print(f'ddelta_bias max diff: {(delta_bias.grad - delta_bias_ref.grad).abs().max().item()}')
print(f'dout_proj_weight max diff: {(out_proj_weight.grad - out_proj_weight_ref.grad).abs().max().item()}')
print(f'ddelta_proj_weight max diff: {(delta_proj_weight.grad - delta_proj_weight_ref.grad).abs().max().item()}')
print(f'dx_proj_weight max diff: {(x_proj_weight.grad - x_proj_weight_ref.grad).abs().max().item()}')
print(f'dconv1d_weight max diff: {(conv1d_weight.grad - conv1d_weight_ref.grad).abs().max().item()}')
print(f'dconv1d_bias max diff: {(conv1d_bias.grad - conv1d_bias_ref.grad).abs().max().item()}')
# assert torch.allclose(xz.grad, xz_ref.grad.to(dtype=itype), rtol=rtol * 2, atol=atol * 2)
# assert torch.allclose(delta.grad, delta_ref.grad.to(dtype=itype), rtol=rtol * 5, atol=atol * 10)
# assert torch.allclose(A.grad, A_ref.grad, rtol=rtolw, atol=atolw * 5)
# assert torch.allclose(B.grad, B_ref.grad, rtol=rtolw if not is_variable_B else rtol,
# atol=atolw if not is_variable_B else atol)
# assert torch.allclose(C.grad, C_ref.grad, rtol=rtolw if not is_variable_C else rtol,
# atol=atolw if not is_variable_C else atol)
# assert torch.allclose(D.grad, D_ref.grad, rtol=rtolw, atol=atolw)
# assert torch.allclose(delta_bias.grad, delta_bias_ref.grad, rtol=rtolw, atol=atolw)
import math
import torch
import torch.nn.functional as F
import pytest
from einops import rearrange, repeat
from mamba_ssm.ops.triton.layernorm_gated import layernorm_fn, rms_norm_ref
@pytest.mark.parametrize("norm_before_gate", [True, False])
# @pytest.mark.parametrize("norm_before_gate", [False])
@pytest.mark.parametrize("has_group", [False, True])
# @pytest.mark.parametrize("has_group", [False])
@pytest.mark.parametrize("is_rms_norm", [False, True])
# @pytest.mark.parametrize("is_rms_norm", [True])
@pytest.mark.parametrize("has_z", [False, True])
# @pytest.mark.parametrize("has_z", [True])
@pytest.mark.parametrize("has_bias", [False, True])
# @pytest.mark.parametrize("has_bias", [False])
# @pytest.mark.parametrize('dtype', [torch.float32, torch.float16, torch.bfloat16])
@pytest.mark.parametrize('dtype', [torch.float16])
# @pytest.mark.parametrize("wtype", [torch.float32, torch.float16, torch.bfloat16])
@pytest.mark.parametrize("wtype", [torch.float32])
@pytest.mark.parametrize('d', [2048, 4096])
# @pytest.mark.parametrize('d', [4096])
def test_layer_norm_gated(d, dtype, wtype, has_bias, has_z, is_rms_norm, has_group, norm_before_gate):
if not has_z and not norm_before_gate:
pytest.skip()
if not norm_before_gate and not is_rms_norm: # Reference LN isn't implemented for this case yet
pytest.skip()
device = 'cuda'
rtol, atol = (1e-5, 1e-5) if dtype == torch.float32 else (1e-2, 8e-3)
group_size = None if not has_group else 64
# set seed
torch.random.manual_seed(0)
batch = 16
seqlen = 1024
x = torch.randn(batch, seqlen, d, dtype=dtype, device=device, requires_grad=True)
if has_z:
z = torch.randn(batch, seqlen, d, dtype=dtype, device=device, requires_grad=True)
else:
z = None
weight = torch.randn(d, dtype=wtype, device=device, requires_grad=True)
if has_bias:
bias = torch.randn(d, dtype=wtype, device=device, requires_grad=True)
else:
bias = None
x_ref = x.detach().clone().requires_grad_()
x_pt = x.detach().clone().requires_grad_()
z_ref = z.detach().clone().requires_grad_() if z is not None else None
z_pt = z.detach().clone().requires_grad_() if z is not None else None
weight_ref = weight.detach().clone().requires_grad_()
weight_pt = weight.detach().clone().requires_grad_()
bias_ref = bias.detach().clone().requires_grad_() if bias is not None else None
bias_pt = bias.detach().clone().requires_grad_() if bias is not None else None
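    # out: fused Triton kernel; out_ref: reference computed in higher precision; out_pt: PyTorch baseline used to scale the tolerance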
out = layernorm_fn(x, weight, bias, z=z, eps=1e-5, group_size=group_size, norm_before_gate=norm_before_gate,
is_rms_norm=is_rms_norm)
if not is_rms_norm:
if not has_group:
out_ref = F.layer_norm(x_ref.float(), (d,), weight=weight_ref.float(), bias=bias_ref.float() if bias_ref is not None else None, eps=1e-5)
out_pt = F.layer_norm(x_pt.to(wtype), (d,), weight=weight_pt, bias=bias_pt, eps=1e-5)
else:
out_ref = rearrange(F.layer_norm(rearrange(x_ref, "... (g d) -> ... g d", d=group_size).float(), (group_size,), eps=1e-5), "... g d -> ... (g d)") * weight_ref.float()
if has_bias:
out_ref = out_ref + bias_ref.float()
out_pt = rearrange(F.layer_norm(rearrange(x_pt, "... (g d) -> ... g d", d=group_size), (group_size,), eps=1e-5), "... g d -> ... (g d)") * weight_pt
if has_bias:
out_pt = out_pt + bias_pt
if has_z and norm_before_gate:
out_ref = out_ref * F.silu(z_ref.float())
out_pt = out_pt * F.silu(z_pt)
else:
out_ref = rms_norm_ref(x_ref, weight_ref, bias_ref, z=z_ref, eps=1e-5, group_size=group_size,
norm_before_gate=norm_before_gate)
out_pt = rms_norm_ref(x_pt, weight_pt, bias_pt, z=z_pt, eps=1e-5, group_size=group_size,
norm_before_gate=norm_before_gate, upcast=False)
print(f"Max diff = {(out - out_ref).abs().max().item()}")
print(f"Max diff Pytorch = {(out_pt - out_ref).abs().max().item()}")
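    # The kernel should be at most ~2x as far from the reference as the PyTorch baseline, plus atol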
assert (out - out_ref).abs().max().item() <= 2 * (out_pt - out_ref).abs().max().item() + atol
g = torch.randn_like(out)
out.backward(g)
out_ref.backward(g)
out_pt.backward(g)
print(f"Max dx diff = {(x.grad - x_ref.grad).abs().max().item()}")
print(f"Max dx diff Pytorch = {(x_pt.grad - x_ref.grad).abs().max().item()}")
if has_z:
print(f"Max dz diff = {(z.grad - z_ref.grad).abs().max().item()}")
print(f"Max dz diff Pytorch = {(z_pt.grad - z_ref.grad).abs().max().item()}")
print(f"Max dw diff = {(weight.grad - weight_ref.grad).abs().max().item()}")
print(f"Max dw diff Pytorch = {(weight_pt.grad - weight_ref.grad).abs().max().item()}")
if has_bias:
print(f"Max db diff = {(bias.grad - bias_ref.grad).abs().max().item()}")
print(f"Max db diff Pytorch = {(bias_pt.grad - bias_ref.grad).abs().max().item()}")
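    # Same relative error bound for the gradients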
assert (x.grad - x_ref.grad).abs().max().item() <= 2 * (x_pt.grad - x_ref.grad).abs().max().item() + atol
if has_z:
assert (z.grad - z_ref.grad).abs().max().item() <= 2 * (z_pt.grad - z_ref.grad).abs().max().item() + atol
assert (weight.grad - weight_ref.grad).abs().max().item() <= 2 * (weight_pt.grad - weight_ref.grad).abs().max().item() + atol
if has_bias:
assert (bias.grad - bias_ref.grad).abs().max().item() <= 2 * (bias_pt.grad - bias_ref.grad).abs().max().item() + atol
# Copyright (C) 2023, Tri Dao.
import math
import torch
import torch.nn.functional as F
import pytest
from einops import rearrange, repeat
from mamba_ssm.ops.triton.selective_state_update import selective_state_update, selective_state_update_ref
@pytest.mark.parametrize("itype", [torch.float32, torch.float16, torch.bfloat16])
# @pytest.mark.parametrize('itype', [torch.float16])
@pytest.mark.parametrize("has_z", [False, True])
# @pytest.mark.parametrize('has_z', [True])
@pytest.mark.parametrize("dstate", [16, 32, 64])
# @pytest.mark.parametrize("dstate", [16])
@pytest.mark.parametrize("dim", [2048, 2048 + 16, 4096])
# @pytest.mark.parametrize("dim", [2048])
def test_selective_state_update(dim, dstate, has_z, itype):
device = "cuda"
rtol, atol = (3e-4, 1e-3) if itype == torch.float32 else (5e-3, 1e-2)
if itype == torch.bfloat16:
rtol, atol = 1e-2, 5e-2
if torch.version.hip:
atol *= 2
# set seed
torch.random.manual_seed(0)
batch_size = 2
state = torch.randn(batch_size, dim, dstate, dtype=itype, device=device)
x = torch.randn(batch_size, dim, device=device, dtype=itype)
dt = torch.randn(batch_size, dim, device=device, dtype=itype)
dt_bias = torch.rand(dim, device=device) - 4.0
A = -torch.rand(dim, dstate, device=device) - 1.0
B = torch.randn(batch_size, dstate, device=device)
C = torch.randn(batch_size, dstate, device=device)
D = torch.randn(dim, device=device)
if has_z:
z = torch.randn_like(x)
else:
z = None
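    # selective_state_update modifies `state` in place, so keep a copy for the reference path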
state_ref = state.detach().clone()
out = selective_state_update(state, x, dt, A, B, C, D=D, z=z, dt_bias=dt_bias, dt_softplus=True)
out_ref = selective_state_update_ref(state_ref, x, dt, A, B, C, D=D, z=z, dt_bias=dt_bias, dt_softplus=True)
print(f"Output max diff: {(out - out_ref).abs().max().item()}")
print(f"Output mean diff: {(out - out_ref).abs().mean().item()}")
assert torch.allclose(state, state_ref, rtol=rtol, atol=atol)
assert torch.allclose(out, out_ref, rtol=rtol, atol=atol)
@pytest.mark.parametrize("itype", [torch.float32, torch.float16, torch.bfloat16])
# @pytest.mark.parametrize('itype', [torch.float16])
@pytest.mark.parametrize("has_z", [False, True])
# @pytest.mark.parametrize('has_z', [True])
@pytest.mark.parametrize("tie_hdim", [False, True])
# @pytest.mark.parametrize('tie_hdim', [True])
@pytest.mark.parametrize("ngroups", [1, 2, 4])
# @pytest.mark.parametrize("ngroups", [2])
@pytest.mark.parametrize("dstate", [16, 32, 64])
# @pytest.mark.parametrize("dstate", [16])
@pytest.mark.parametrize("dim", [2048, 4096])
# @pytest.mark.parametrize("dim", [2048])
def test_selective_state_update_with_heads(dim, dstate, ngroups, has_z, tie_hdim, itype):
device = "cuda"
rtol, atol = (3e-4, 1e-3) if itype == torch.float32 else (5e-3, 3e-2)
if itype == torch.bfloat16:
rtol, atol = 1e-2, 1e-1
# set seed
torch.random.manual_seed(0)
batch_size = 2
headdim = 64
nheads = dim // headdim
state = torch.randn(batch_size, nheads, headdim, dstate, dtype=itype, device=device)
x = torch.randn(batch_size, nheads, headdim, device=device, dtype=itype)
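    # With tie_hdim, dt, dt_bias, A and D are shared across headdim (A also across dstate) and expanded via repeat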
if not tie_hdim:
dt = torch.randn(batch_size, nheads, headdim, device=device, dtype=itype)
dt_bias = torch.rand(nheads, headdim, device=device) - 4.0
A = -torch.rand(nheads, headdim, dstate, device=device) - 1.0
D = torch.randn(nheads, headdim, device=device)
else:
dt = repeat(torch.randn(batch_size, nheads, device=device, dtype=itype), "b h -> b h p", p=headdim)
dt_bias = repeat(torch.rand(nheads, device=device) - 4.0, "h -> h p", p=headdim)
A = repeat(-torch.rand(nheads, device=device) - 1.0, "h -> h p n", p=headdim, n=dstate)
D = repeat(torch.randn(nheads, device=device), "h -> h p", p=headdim)
B = torch.randn(batch_size, ngroups, dstate, device=device)
C = torch.randn(batch_size, ngroups, dstate, device=device)
if has_z:
z = torch.randn_like(x)
else:
z = None
state_ref = state.detach().clone()
state_og = state.detach().clone()
out = selective_state_update(state, x, dt, A, B, C, D=D, z=z, dt_bias=dt_bias, dt_softplus=True)
out_ref = selective_state_update_ref(state_ref, x, dt, A, B, C, D=D, z=z, dt_bias=dt_bias, dt_softplus=True)
print(f"Output max diff: {(out - out_ref).abs().max().item()}")
print(f"Output mean diff: {(out - out_ref).abs().mean().item()}")
assert torch.allclose(state, state_ref, rtol=rtol, atol=atol)
assert torch.allclose(out, out_ref, rtol=rtol, atol=atol)
import math
import torch
import torch.nn.functional as F
import pytest
from einops import rearrange, repeat
from mamba_ssm.ops.triton.ssd_chunk_state import chunk_state, chunk_state_ref
from mamba_ssm.ops.triton.ssd_chunk_state import _chunk_cumsum_fwd, _chunk_state_fwd
from mamba_ssm.ops.triton.ssd_chunk_state import chunk_state_varlen
from mamba_ssm.ops.triton.ssd_state_passing import state_passing, state_passing_ref
from mamba_ssm.ops.triton.ssd_state_passing import _state_passing_fwd
from mamba_ssm.ops.triton.ssd_chunk_scan import chunk_scan, chunk_scan_ref
from mamba_ssm.ops.triton.ssd_combined import mamba_chunk_scan_combined, mamba_chunk_scan, ssd_chunk_scan_combined_ref, ssd_selective_scan
from mamba_ssm.ops.triton.ssd_combined import mamba_split_conv1d_scan_combined, mamba_split_conv1d_scan_ref
def detach_clone(*args):
return tuple([arg.detach().clone().requires_grad_() if arg is not None else None for arg in args])
@pytest.mark.parametrize('dtype', [torch.float32, torch.float16, torch.bfloat16])
# @pytest.mark.parametrize('dtype', [torch.bfloat16])
@pytest.mark.parametrize('ngroups', [1, 2, 8, "max"])
# @pytest.mark.parametrize('ngroups', [1])
@pytest.mark.parametrize('chunk_size', [64, 128])
# @pytest.mark.parametrize('chunk_size', [128])
def test_chunk_state_varlen(chunk_size, ngroups, dtype):
device = 'cuda'
rtol, atol = (1e-2, 3e-3)
# set seed
torch.random.manual_seed(chunk_size + (ngroups if ngroups != "max" else 64))
batch = 300
seqlens = torch.randint(1, 200, (batch,), device=device)
# batch = 3
# seqlens = torch.tensor([201, 56, 5], device=device)
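    # cu_seqlens marks the packed sequence boundaries; seq_idx gives each token its sequence index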
cu_seqlens = F.pad(seqlens.cumsum(0), (1, 0))
total_seqlen = seqlens.sum().item()
seq_idx = torch.cat([torch.full((s,), i, dtype=torch.int32, device=device) for i, s in enumerate(seqlens)], dim=0).unsqueeze(0)
dim = 4096
# dim = 64
headdim = 64
# dim = 32
dstate = 32
assert dim % headdim == 0
nheads = dim // headdim
if ngroups == "max":
ngroups = nheads
assert nheads % ngroups == 0
B = torch.randn(total_seqlen, ngroups, dstate, dtype=dtype, device=device) / 5
x = torch.randn(total_seqlen, nheads, headdim, dtype=dtype, device=device)
A = -0.1 * (torch.rand(nheads, device=device))
dt = F.softplus(torch.randn(total_seqlen, nheads, device=device, dtype=torch.float32) - 4)
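    # Fused path: per-chunk cumulative sums and chunk states, then pass states across chunk boundaries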
dA_cumsum, dt_rounded = _chunk_cumsum_fwd(dt.unsqueeze(0), A, chunk_size)
chunk_states = _chunk_state_fwd(B.unsqueeze(0), x.unsqueeze(0), dt_rounded, dA_cumsum, seq_idx=seq_idx)
chunk_states, _ = _state_passing_fwd(rearrange(chunk_states, "... p n -> ... (p n)"), dA_cumsum[:, :, :, -1],
seq_idx=seq_idx, chunk_size=chunk_size)
chunk_states = rearrange(chunk_states, "... (p n) -> ... p n", n=dstate)
chunk_states = chunk_states.squeeze(0)
dA_cumsum = dA_cumsum.squeeze(0)
dt_rounded = dt_rounded.squeeze(0)
out = chunk_state_varlen(B, x, dt_rounded, dA_cumsum, cu_seqlens, chunk_states)
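    # Reference: run each sequence separately and keep only its final state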
out_ref = []
for b in range(batch):
x_s = x[cu_seqlens[b]:cu_seqlens[b + 1]].unsqueeze(0)
B_s = B[cu_seqlens[b]:cu_seqlens[b + 1]].unsqueeze(0)
dt_s = dt[cu_seqlens[b]:cu_seqlens[b + 1]].unsqueeze(0)
dA_cumsum_s, dt_rounded_s = _chunk_cumsum_fwd(dt_s, A, chunk_size)
states = chunk_state(B_s, x_s, dt_rounded_s, dA_cumsum_s)
_, final_states = _state_passing_fwd(rearrange(states, "... p n -> ... (p n)"), dA_cumsum_s[:, :, :, -1],
chunk_size=chunk_size)
final_states = rearrange(final_states, "... (p n) -> ... p n", n=dstate)
out_ref.append(final_states)
out_ref = torch.cat(out_ref, dim=0)
print(f"Max diff = {(out - out_ref).abs().max().item()}")
assert torch.allclose(out, out_ref, rtol=rtol, atol=atol)
import torch
import torch.nn.functional as F
from mamba_ssm.models.mixer_seq_simple import MambaLMHeadModel
from mamba_ssm.models.config_mamba import MambaConfig
from mamba_ssm.utils.generation import InferenceParams
import pytest
from einops import rearrange, repeat
def test_generation():
batch = 3
seqlen = 20
device = "cuda"
dtype = torch.float16
config = MambaConfig(
d_model=1024,
n_layer=4,
vocab_size=50277,
ssm_cfg=dict(layer="Mamba2"),
rms_norm=True,
residual_in_fp32=True,
fused_add_norm=True,
pad_vocab_size_multiple=16,
)
torch.manual_seed(2357)
model = MambaLMHeadModel(config, device=device, dtype=dtype)
x = torch.randint(0, 1000, (batch, seqlen), device=device, dtype=torch.long)
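    # Reference logits from a single full-length forward pass (no incremental decoding)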
out_ref = model(x).logits
prompt_len = seqlen // 2
out = model.generate(
        input_ids=x[:, :prompt_len], max_length=seqlen, output_scores=True, return_dict_in_generate=True,
cg=True, # Can turn off CUDA graph for easier debugging
        # Instead of sampling, take output tokens from x so the logits can be checked against the reference
# For actual generation, don't pass in teacher_outputs
teacher_outputs=x,
)
out_scores = torch.stack(out.scores, dim=1)
print(f"Max diff: {(out_scores - out_ref[:, prompt_len - 1: -1]).abs().max()}")
assert torch.allclose(out_scores, out_ref[:, prompt_len - 1: -1], rtol=1e-3, atol=1e-2)
def test_generation_varlen():
seqlens = [170, 65, 100]
genlen = 20
total_seqlen = sum(seqlens)
device = "cuda"
dtype = torch.float16
config = MambaConfig(
d_model=1024,
n_layer=4,
vocab_size=50277,
ssm_cfg=dict(layer="Mamba2"),
rms_norm=True,
residual_in_fp32=True,
fused_add_norm=True,
pad_vocab_size_multiple=16,
)
torch.manual_seed(2357)
model = MambaLMHeadModel(config, device=device, dtype=dtype)
xs = [torch.randint(0, 1000, (1, seqlen), device=device, dtype=torch.long) for seqlen in seqlens]
# Reference 1: Forward pass with seq_idx
x = torch.cat(xs, dim=1)
seq_idx = torch.cat([torch.full((ids.shape[1],), i, dtype=torch.int32, device=device)
for i, ids in enumerate(xs)], dim=0).unsqueeze(0)
cu_seqlens = F.pad(torch.tensor(seqlens, device=device, dtype=torch.int32).cumsum(dim=0), (1, 0))
out_ref = model(x, seq_idx=seq_idx).logits
# Only take the last @genlen logits of each sequence
out_ref = torch.cat([out_ref[:, cu_seqlens[i + 1] - genlen - 1:cu_seqlens[i + 1] - 1]
for i in range(len(seqlens))], dim=0)
# Reference 2: Generate the last @genlen tokens of each sequence in a for loop
out_loop = []
for input_ids in xs:
out = model.generate(
input_ids=input_ids[:, :-genlen], max_length=input_ids.shape[1], output_scores=True,
return_dict_in_generate=True, cg=True, teacher_outputs=input_ids,
).scores
out_loop.append(torch.stack(out, dim=1))
out_loop = torch.cat(out_loop, dim=0)
print(f"Max diff between ref1 and ref2: {(out_loop - out_ref).abs().max()}")
# Varlen generation
input_ids = torch.cat([ids[:, :-genlen] for ids in xs], dim=1)
prompt_seqlens = [seqlen - genlen for seqlen in seqlens]
cu_seqlens = F.pad(torch.tensor(prompt_seqlens, device=device, dtype=torch.int32).cumsum(dim=0), (1, 0))
seq_idx = torch.cat([torch.full((seqlen,), i, dtype=torch.int32, device=device)
for i, seqlen in enumerate(prompt_seqlens)], dim=0).unsqueeze(0)
inference_params = InferenceParams(max_seqlen=2048, max_batch_size=len(seqlens))
scores, sequences = [], []
# Both seq_idx and cu_seqlens must be passed in for varlen generation
logits = model(input_ids, inference_params=inference_params, seq_idx=seq_idx, cu_seqlens=cu_seqlens).logits
logits = rearrange(logits[0, cu_seqlens[1:] - 1], "b d -> b 1 d")
scores.append(logits)
# In practice we should sample. In this case we take from the teacher_output for testing
sampled_tokens = rearrange(torch.stack([ids[0, -genlen] for ids in xs], dim=0), "b -> b 1")
sequences.append(sampled_tokens)
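    # Decode the remaining tokens one at a time, reusing the cached state via inference_params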
for i in range(1, genlen):
inference_params.seqlen_offset += 1
logits = model(sampled_tokens, inference_params=inference_params, num_last_tokens=1).logits
scores.append(logits)
# In practice we should sample. In this case we take from the teacher_output for testing
sampled_tokens = rearrange(torch.stack([ids[0, -genlen + i] for ids in xs], dim=0), "b -> b 1")
sequences.append(sampled_tokens)
out_varlen = torch.cat(scores, dim=1)
print(f"Max diff: {(out_varlen - out_ref).abs().max()}")
assert (out_varlen - out_ref).abs().max() < 2 * (out_loop - out_ref).abs().max()