Commit 25991f98 authored by hepj

Modify README

parent ac192496
# Copyright (c) 2024, Tri Dao.
import sys
import warnings
import os
import re
import shutil
import ast
from pathlib import Path
from packaging.version import parse, Version
import platform
from setuptools import setup, find_packages
import subprocess
import urllib.request
import urllib.error
from wheel.bdist_wheel import bdist_wheel as _bdist_wheel
import torch
from torch.utils.cpp_extension import BuildExtension, CppExtension, CUDAExtension, CUDA_HOME, HIP_HOME
with open("README.md", "r", encoding="utf-8") as fh:
long_description = fh.read()
# ninja build does not work unless include_dirs are abs path
this_dir = os.path.dirname(os.path.abspath(__file__))
PACKAGE_NAME = "causal_conv1d"
BASE_WHEEL_URL = "https://github.com/Dao-AILab/causal-conv1d/releases/download/{tag_name}/{wheel_name}"
# FORCE_BUILD: Force a fresh build locally, instead of attempting to find prebuilt wheels
# SKIP_CUDA_BUILD: Intended to allow CI to use a simple `python setup.py sdist` run to copy over raw files, without any cuda compilation
FORCE_BUILD = os.getenv("CAUSAL_CONV1D_FORCE_BUILD", "FALSE") == "TRUE"
SKIP_CUDA_BUILD = os.getenv("CAUSAL_CONV1D_SKIP_CUDA_BUILD", "FALSE") == "TRUE"
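# For example (illustrative only), a fresh local source build can be forced with:
#   CAUSAL_CONV1D_FORCE_BUILD=TRUE pip install .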
# For CI, we want the option to build with C++11 ABI since the nvcr images use C++11 ABI
FORCE_CXX11_ABI = os.getenv("CAUSAL_CONV1D_FORCE_CXX11_ABI", "FALSE") == "TRUE"
def get_platform():
"""
Returns the platform name as used in wheel filenames.
"""
if sys.platform.startswith("linux"):
return "linux_x86_64"
elif sys.platform == "darwin":
mac_version = ".".join(platform.mac_ver()[0].split(".")[:2])
return f"macosx_{mac_version}_x86_64"
elif sys.platform == "win32":
return "win_amd64"
else:
raise ValueError("Unsupported platform: {}".format(sys.platform))
def get_cuda_bare_metal_version(cuda_dir):
raw_output = subprocess.check_output(
[cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True
)
output = raw_output.split()
release_idx = output.index("release") + 1
bare_metal_version = parse(output[release_idx].split(",")[0])
return raw_output, bare_metal_version
def get_hip_version(rocm_dir):
hipcc_bin = "hipcc" if rocm_dir is None else os.path.join(rocm_dir, "bin", "hipcc")
try:
raw_output = subprocess.check_output(
[hipcc_bin, "--version"], universal_newlines=True
)
except Exception as e:
print(
f"hip installation not found: {e} ROCM_PATH={os.environ.get('ROCM_PATH')}"
)
return None, None
for line in raw_output.split("\n"):
if "HIP version" in line:
rocm_version = parse(line.split()[-1].replace("-", "+")) # local version is not parsed correctly
return line, rocm_version
return None, None
def get_torch_hip_version():
if torch.version.hip:
return parse(torch.version.hip.split()[-1].replace("-", "+"))
else:
return None
def check_if_hip_home_none(global_option: str) -> None:
if HIP_HOME is not None:
return
# warn instead of error because user could be downloading prebuilt wheels, so hipcc won't be necessary
# in that case.
warnings.warn(
f"{global_option} was requested, but hipcc was not found. Are you sure your environment has hipcc available?"
)
def check_if_cuda_home_none(global_option: str) -> None:
if CUDA_HOME is not None:
return
# warn instead of error because user could be downloading prebuilt wheels, so nvcc won't be necessary
# in that case.
warnings.warn(
f"{global_option} was requested, but nvcc was not found. Are you sure your environment has nvcc available? "
"If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, "
"only images whose names contain 'devel' will provide nvcc."
)
def append_nvcc_threads(nvcc_extra_args):
return nvcc_extra_args + ["--threads", "4"]
cmdclass = {}
ext_modules = []
HIP_BUILD = bool(torch.version.hip)
if not SKIP_CUDA_BUILD:
print("\n\ntorch.__version__ = {}\n\n".format(torch.__version__))
TORCH_MAJOR = int(torch.__version__.split(".")[0])
TORCH_MINOR = int(torch.__version__.split(".")[1])
cc_flag = []
if HIP_BUILD:
check_if_hip_home_none(PACKAGE_NAME)
rocm_home = os.getenv("ROCM_PATH")
_, hip_version = get_hip_version(rocm_home)
# if HIP_HOME is not None:
# if hip_version < Version("6.0"):
# raise RuntimeError(
# f"{PACKAGE_NAME} is only supported on ROCm 6.0 and above. "
# "Note: make sure HIP has a supported version by running hipcc --version."
# )
# if hip_version == Version("6.0"):
# warnings.warn(
# f"{PACKAGE_NAME} requires a patch to be applied when running on ROCm 6.0. "
# "Refer to the README.md for detailed instructions.",
# UserWarning
# )
cc_flag.append("-DBUILD_PYTHON_PACKAGE")
else:
check_if_cuda_home_none(PACKAGE_NAME)
# Check if CUDA 11 is installed for compute capability 8.0
if CUDA_HOME is not None:
_, bare_metal_version = get_cuda_bare_metal_version(CUDA_HOME)
if bare_metal_version < Version("11.6"):
raise RuntimeError(
f"{PACKAGE_NAME} is only supported on CUDA 11.6 and above. "
"Note: make sure nvcc has a supported version by running nvcc -V."
)
cc_flag.append("-gencode")
cc_flag.append("arch=compute_53,code=sm_53")
cc_flag.append("-gencode")
cc_flag.append("arch=compute_62,code=sm_62")
cc_flag.append("-gencode")
cc_flag.append("arch=compute_70,code=sm_70")
cc_flag.append("-gencode")
cc_flag.append("arch=compute_72,code=sm_72")
cc_flag.append("-gencode")
cc_flag.append("arch=compute_80,code=sm_80")
cc_flag.append("-gencode")
cc_flag.append("arch=compute_87,code=sm_87")
if bare_metal_version >= Version("11.8"):
cc_flag.append("-gencode")
cc_flag.append("arch=compute_90,code=sm_90")
# HACK: The compiler flag -D_GLIBCXX_USE_CXX11_ABI is set to be the same as
# torch._C._GLIBCXX_USE_CXX11_ABI
# https://github.com/pytorch/pytorch/blob/8472c24e3b5b60150096486616d98b7bea01500b/torch/utils/cpp_extension.py#L920
if FORCE_CXX11_ABI:
torch._C._GLIBCXX_USE_CXX11_ABI = True
if HIP_BUILD:
extra_compile_args = {
"cxx": ["-O3", "-std=c++17"],
"nvcc": [
"-O3",
"-std=c++17",
f"--offload-arch={os.getenv('HIP_ARCHITECTURES', 'native')}",
"-U__CUDA_NO_HALF_OPERATORS__",
"-U__CUDA_NO_HALF_CONVERSIONS__",
"-fgpu-flush-denormals-to-zero",
]
+ cc_flag,
}
else:
extra_compile_args = {
"cxx": ["-O3"],
"nvcc": append_nvcc_threads(
[
"-O3",
"-U__CUDA_NO_HALF_OPERATORS__",
"-U__CUDA_NO_HALF_CONVERSIONS__",
"-U__CUDA_NO_BFLOAT16_OPERATORS__",
"-U__CUDA_NO_BFLOAT16_CONVERSIONS__",
"-U__CUDA_NO_BFLOAT162_OPERATORS__",
"-U__CUDA_NO_BFLOAT162_CONVERSIONS__",
"--expt-relaxed-constexpr",
"--expt-extended-lambda",
"--use_fast_math",
"--ptxas-options=-v",
"-lineinfo",
]
+ cc_flag
),
}
ext_modules.append(
CUDAExtension(
name="causal_conv1d_cuda",
sources=[
"csrc/causal_conv1d.cpp",
"csrc/causal_conv1d_fwd.cu",
"csrc/causal_conv1d_bwd.cu",
"csrc/causal_conv1d_update.cu",
],
extra_compile_args=extra_compile_args,
include_dirs=[Path(this_dir) / "csrc" / "causal_conv1d"],
)
)
def get_package_version():
with open(Path(this_dir) / "causal_conv1d" / "__init__.py", "r") as f:
version_match = re.search(r"^__version__\s*=\s*(.*)$", f.read(), re.MULTILINE)
public_version = ast.literal_eval(version_match.group(1))
local_version = os.environ.get("CAUSAL_CONV1D_LOCAL_VERSION")
if local_version:
return f"{public_version}+{local_version}"
else:
return str(public_version)
def get_wheel_url():
# Determine the version numbers that will be used to select the correct wheel
torch_version_raw = parse(torch.__version__)
if HIP_BUILD:
# We're using the HIP version used to build torch, not the one currently installed
torch_hip_version = get_torch_hip_version()
hip_version = f"{torch_hip_version.major}{torch_hip_version.minor}"
else:
# We're using the CUDA version used to build torch, not the one currently installed
# _, cuda_version_raw = get_cuda_bare_metal_version(CUDA_HOME)
torch_cuda_version = parse(torch.version.cuda)
# For CUDA 11, we only compile for CUDA 11.8, and for CUDA 12 we only compile for CUDA 12.2
# to save CI time. Minor versions should be compatible.
torch_cuda_version = parse("11.8") if torch_cuda_version.major == 11 else parse("12.2")
cuda_version = f"{torch_cuda_version.major}{torch_cuda_version.minor}"
gpu_compute_version = hip_version if HIP_BUILD else cuda_version
cuda_or_hip = "hip" if HIP_BUILD else "cu"
python_version = f"cp{sys.version_info.major}{sys.version_info.minor}"
platform_name = get_platform()
causal_conv1d_version = get_package_version()
torch_version = f"{torch_version_raw.major}.{torch_version_raw.minor}"
cxx11_abi = str(torch._C._GLIBCXX_USE_CXX11_ABI).upper()
# Determine wheel URL based on CUDA version, torch version, python version and OS
wheel_filename = f"{PACKAGE_NAME}-{causal_conv1d_version}+{cuda_or_hip}{gpu_compute_version}torch{torch_version}cxx11abi{cxx11_abi}-{python_version}-{python_version}-{platform_name}.whl"
wheel_url = BASE_WHEEL_URL.format(
tag_name=f"v{causal_conv1d_version}", wheel_name=wheel_filename
)
return wheel_url, wheel_filename
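# Illustrative example of the filename built above (all version values are hypothetical):
#   causal_conv1d-1.2.0+cu122torch2.2cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
# i.e. package version, the CUDA/HIP build of torch, torch major.minor, the C++11 ABI
# flag, the CPython tag (twice), and the platform tag from get_platform().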
class CachedWheelsCommand(_bdist_wheel):
"""
The CachedWheelsCommand plugs into the default bdist wheel, which is run by pip when it cannot
find an existing wheel (which is currently the case for all installs). We use
the environment parameters to detect whether there is already a pre-built version of a compatible
wheel available and short-circuit the standard full build pipeline.
"""
def run(self):
if FORCE_BUILD:
return super().run()
wheel_url, wheel_filename = get_wheel_url()
print("Guessing wheel URL: ", wheel_url)
try:
urllib.request.urlretrieve(wheel_url, wheel_filename)
# Make the archive
# Lifted from the root wheel processing command
# https://github.com/pypa/wheel/blob/cf71108ff9f6ffc36978069acb28824b44ae028e/src/wheel/bdist_wheel.py#LL381C9-L381C85
if not os.path.exists(self.dist_dir):
os.makedirs(self.dist_dir)
impl_tag, abi_tag, plat_tag = self.get_tag()
archive_basename = f"{self.wheel_dist_name}-{impl_tag}-{abi_tag}-{plat_tag}"
wheel_path = os.path.join(self.dist_dir, archive_basename + ".whl")
print("Raw wheel path", wheel_path)
shutil.move(wheel_filename, wheel_path)
except urllib.error.HTTPError:
print("Precompiled wheel not found. Building from source...")
# If the wheel could not be downloaded, build from source
super().run()
setup(
name=PACKAGE_NAME,
version=get_package_version(),
packages=find_packages(
exclude=(
"build",
"csrc",
"include",
"tests",
"dist",
"docs",
"benchmarks",
"causal_conv1d.egg-info",
)
),
author="Tri Dao",
author_email="tri@tridao.me",
description="Causal depthwise conv1d in CUDA, with a PyTorch interface",
long_description=long_description,
long_description_content_type="text/markdown",
url="https://github.com/Dao-AILab/causal-conv1d",
classifiers=[
"Programming Language :: Python :: 3",
"License :: OSI Approved :: BSD License",
"Operating System :: Unix",
],
ext_modules=ext_modules,
cmdclass={"bdist_wheel": CachedWheelsCommand, "build_ext": BuildExtension}
if ext_modules
else {
"bdist_wheel": CachedWheelsCommand,
},
python_requires=">=3.8",
install_requires=[
"torch",
"packaging",
"ninja",
],
)
# Copyright (C) 2024, Tri Dao.
import math
import torch
import torch.nn.functional as F
import pytest
from einops import rearrange
from causal_conv1d.causal_conv1d_interface import causal_conv1d_fn, causal_conv1d_ref
from causal_conv1d.causal_conv1d_interface import causal_conv1d_update, causal_conv1d_update_ref
from causal_conv1d.causal_conv1d_varlen import causal_conv1d_varlen_states, causal_conv1d_varlen_states_ref
@pytest.mark.parametrize("return_final_states", [False, True])
# @pytest.mark.parametrize("return_final_states", [True])
@pytest.mark.parametrize("has_initial_states", [False, True])
# @pytest.mark.parametrize("has_initial_states", [False])
@pytest.mark.parametrize("channel_last", [False, True])
# @pytest.mark.parametrize('channel_last', [True])
@pytest.mark.parametrize("itype", [torch.float32, torch.float16, torch.bfloat16])
# @pytest.mark.parametrize('itype', [torch.float16])
@pytest.mark.parametrize("silu_activation", [False, True])
# @pytest.mark.parametrize('silu_activation', [True])
@pytest.mark.parametrize("has_bias", [False, True])
# @pytest.mark.parametrize('has_bias', [True])
@pytest.mark.parametrize("width", [2, 3, 4])
# @pytest.mark.parametrize('width', [3])
@pytest.mark.parametrize(
"seqlen", [1, 2, 8, 16, 32, 64, 128, 129, 130, 151, 256, 372, 512, 784, 1024, 1134, 2048, 4096]
)
# @pytest.mark.parametrize('seqlen', [8, 16, 32, 64, 128, 256, 512, 784, 1024, 2048, 4096])
# @pytest.mark.parametrize('seqlen', [128])
@pytest.mark.parametrize('dim', [64, 4096 + 32])
# @pytest.mark.parametrize('dim', [64])
def test_causal_conv1d(dim, seqlen, width, has_bias, silu_activation, itype, channel_last, has_initial_states, return_final_states):
if not channel_last and (has_initial_states or return_final_states):
pytest.skip("Only channel_last support initial_states or return_final_states")
device = "cuda"
rtol, atol = (3e-4, 1e-3) if itype == torch.float32 else (3e-3, 5e-3)
if itype == torch.bfloat16:
rtol, atol = 1e-2, 5e-2
rtolw, atolw = (1e-3, 1e-3)
# set seed
torch.random.manual_seed(0)
batch = 2
# batch = 1
if not channel_last:
x = torch.randn(batch, 4096 + dim + 64, seqlen, device=device, dtype=itype)[:, 4096:4096 + dim, :].requires_grad_()
else:
x = rearrange(
torch.randn(batch, seqlen, 4096 + dim + 64, device=device, dtype=itype)[:, :, 4096:4096 + dim], "b s d -> b d s"
).requires_grad_()
weight = torch.randn(dim, width, device=device, dtype=torch.float32, requires_grad=True)
if has_bias:
bias = torch.randn(dim, device=device, dtype=torch.float32, requires_grad=True)
else:
bias = None
if has_initial_states:
initial_states = torch.randn(batch, width - 1, dim, device=device, dtype=itype).transpose(1, 2).requires_grad_()
else:
initial_states = None
x_ref = x.detach().clone().requires_grad_()
weight_ref = weight.detach().clone().requires_grad_()
bias_ref = bias.detach().clone().requires_grad_() if bias is not None else None
initial_states_ref = initial_states.detach().clone().requires_grad_() if initial_states is not None else None
activation = None if not silu_activation else "silu"
out = causal_conv1d_fn(x, weight, bias, initial_states=initial_states, return_final_states=return_final_states,
activation=activation)
out_ref = causal_conv1d_ref(x_ref, weight_ref, bias_ref, initial_states=initial_states_ref, return_final_states=return_final_states, activation=activation)
if return_final_states:
out, final_states = out
out_ref, final_states_ref = out_ref
print(f"Final states max diff: {(final_states - final_states_ref).abs().max().item()}")
print(f"Final states mean diff: {(final_states - final_states_ref).abs().mean().item()}")
assert torch.allclose(final_states, final_states_ref, rtol=rtol, atol=atol)
print(f"Output max diff: {(out - out_ref).abs().max().item()}")
print(f"Output mean diff: {(out - out_ref).abs().mean().item()}")
assert torch.allclose(out, out_ref, rtol=rtol, atol=atol)
if return_final_states:
out += F.sigmoid(final_states).sum(dim=-1, keepdim=True)
out_ref += F.sigmoid(final_states_ref).sum(dim=-1, keepdim=True)
g = torch.randn_like(out)
out.backward(g)
out_ref.backward(g)
print(f"dx max diff: {(x.grad - x_ref.grad).abs().max().item()}")
print(f"dweight max diff: {(weight.grad - weight_ref.grad).abs().max().item()}")
if has_bias:
print(f"dbias max diff: {(bias.grad - bias_ref.grad).abs().max().item()}")
if has_initial_states:
print(f"dinitial_states max diff: {(initial_states.grad - initial_states_ref.grad).abs().max().item()}")
assert torch.allclose(x.grad, x_ref.grad.to(dtype=itype), rtol=rtol, atol=atol)
assert torch.allclose(weight.grad, weight_ref.grad, rtol=rtolw, atol=atolw)
if has_bias:
assert torch.allclose(bias.grad, bias_ref.grad, rtol=rtolw, atol=atolw)
if has_initial_states:
assert torch.allclose(initial_states.grad, initial_states_ref.grad.to(dtype=itype), rtol=rtol, atol=atol)
@pytest.mark.parametrize("itype", [torch.float32, torch.float16, torch.bfloat16])
# @pytest.mark.parametrize('itype', [torch.float16])
@pytest.mark.parametrize("silu_activation", [False, True])
# @pytest.mark.parametrize('silu_activation', [True])
@pytest.mark.parametrize("has_bias", [False, True])
# @pytest.mark.parametrize('has_bias', [True])
@pytest.mark.parametrize("has_cache_seqlens", [False, True])
# @pytest.mark.parametrize('has_cache_seqlens', [True])
@pytest.mark.parametrize("seqlen", [1, 4, 5])
# @pytest.mark.parametrize('seqlen', [4])
@pytest.mark.parametrize("width", [2, 3, 4])
# @pytest.mark.parametrize('width', [4])
@pytest.mark.parametrize("dim", [2048, 2048 + 16, 4096])
# @pytest.mark.parametrize("dim", [2048])
def test_causal_conv1d_update(dim, width, seqlen, has_cache_seqlens, has_bias, silu_activation, itype):
device = "cuda"
rtol, atol = (3e-4, 1e-3) if itype == torch.float32 else (3e-3, 5e-3)
if itype == torch.bfloat16:
rtol, atol = 1e-2, 5e-2
rtolw, atolw = (1e-3, 1e-3)
# set seed
torch.random.manual_seed(0)
batch = 64
# batch = 1
# dim = 64
x = torch.randn(batch, seqlen, dim, device=device, dtype=itype).transpose(-1, -2)
state_len = torch.randint(width - 1, width + 10, (1,)).item()
conv_state = torch.randn(batch, state_len, dim, device=device, dtype=itype).transpose(-1, -2)
weight = torch.randn(dim, width, device=device, dtype=torch.float32, requires_grad=True)
if has_bias:
bias = torch.randn(dim, device=device, dtype=torch.float32, requires_grad=True)
else:
bias = None
conv_state_ref = conv_state.detach().clone()
activation = None if not silu_activation else "silu"
cache_seqlens = (torch.randint(0, 1024, (batch,), dtype=torch.int32, device=device)
if has_cache_seqlens else None)
out = causal_conv1d_update(x, conv_state, weight, bias, activation=activation, cache_seqlens=cache_seqlens)
out_ref = causal_conv1d_update_ref(x, conv_state_ref, weight, bias, activation=activation, cache_seqlens=cache_seqlens)
print(f"Output max diff: {(out - out_ref).abs().max().item()}")
print(f"Output mean diff: {(out - out_ref).abs().mean().item()}")
assert torch.equal(conv_state, conv_state_ref)
assert torch.allclose(out, out_ref, rtol=rtol, atol=atol)
@pytest.mark.parametrize("itype", [torch.float32, torch.float16, torch.bfloat16])
# @pytest.mark.parametrize('itype', [torch.float16])
@pytest.mark.parametrize("dim", [2048, 2048 + 16, 4096])
# @pytest.mark.parametrize("dim", [2048])
def test_causal_conv1d_get_states(dim, itype):
device = "cuda"
# set seed
torch.random.manual_seed(0)
seqlens = torch.randint(1, 32, (100,), device=device)
total_seqlen = seqlens.sum().item()
x = torch.randn(total_seqlen, dim, device=device, dtype=itype)
cu_seqlens = F.pad(seqlens.cumsum(0), (1, 0))
state_len = 20
out = causal_conv1d_varlen_states(x, cu_seqlens, state_len)
out_ref = causal_conv1d_varlen_states_ref(x, cu_seqlens, state_len)
assert torch.equal(out, out_ref)
# @pytest.mark.parametrize("channel_last", [False, True])
@pytest.mark.parametrize('channel_last', [True])
# @pytest.mark.parametrize("itype", [torch.float32, torch.float16, torch.bfloat16])
@pytest.mark.parametrize('itype', [torch.bfloat16])
# @pytest.mark.parametrize("silu_activation", [False, True])
@pytest.mark.parametrize('silu_activation', [True])
# @pytest.mark.parametrize("has_bias", [False, True])
@pytest.mark.parametrize('has_bias', [True])
# @pytest.mark.parametrize("width", [2, 3, 4])
@pytest.mark.parametrize('width', [4])
@pytest.mark.parametrize(
# "seqlen", [8, 16, 32, 64, 128, 151, 256, 372, 512, 784, 1024, 1134, 2048, 4096]
"seqlen", [2048]
)
# @pytest.mark.parametrize('seqlen', [8, 16, 32, 64, 128, 256, 512, 784, 1024, 2048, 4096])
# @pytest.mark.parametrize('seqlen', [128])
def test_causal_conv1d_race_condition(seqlen, width, has_bias, silu_activation, itype, channel_last):
device = "cuda"
# set seed
torch.random.manual_seed(0)
batch = 2
# batch = 1
dim = 4096 + 32 # Try dim not divisible by 64
# dim = 64
if not channel_last:
x = torch.randn(batch, 4096 + dim + 64, seqlen, device=device, dtype=itype)[:, 4096:4096 + dim, :].requires_grad_()
else:
x = rearrange(
torch.randn(batch, seqlen, 4096 + dim + 64, device=device, dtype=itype)[:, :, 4096:4096 + dim], "b s d -> b d s"
).requires_grad_()
weight = torch.randn(dim, width, device=device, dtype=torch.float32, requires_grad=True)
if has_bias:
bias = torch.randn(dim, device=device, dtype=torch.float32, requires_grad=True)
else:
bias = None
activation = None if not silu_activation else "silu"
out0 = causal_conv1d_fn(x, weight, bias, activation=activation)
g = torch.randn_like(out0)
dx0, dw0, db0 = torch.autograd.grad(out0, (x, weight, bias), g)
dw_atol = 1e-4
db_atol = 1e-4
for i in range(10000):
out = causal_conv1d_fn(x, weight, bias, activation=activation)
dx, dw, db = torch.autograd.grad(out, (x, weight, bias), g)
dw_equal = torch.allclose(dw, dw0, atol=dw_atol)
# if not dw_equal:
# breakpoint()
if has_bias:
db_equal = torch.allclose(db, db0, atol=db_atol)
# if not db_equal:
# breakpoint()
assert torch.equal(out, out0)
assert torch.equal(dx, dx0)
assert dw_equal
if has_bias:
assert db_equal
@pytest.mark.parametrize("itype", [torch.float32, torch.float16, torch.bfloat16])
# @pytest.mark.parametrize('itype', [torch.float16])
@pytest.mark.parametrize("silu_activation", [False, True])
# @pytest.mark.parametrize('silu_activation', [False])
@pytest.mark.parametrize("has_bias", [False, True])
# @pytest.mark.parametrize('has_bias', [False])
@pytest.mark.parametrize("width", [2, 3, 4])
# @pytest.mark.parametrize('width', [2])
@pytest.mark.parametrize(
"seqlen", [8, 16, 32, 64, 128, 151, 256, 372, 512, 784, 1024, 1134, 2048, 4096]
)
# @pytest.mark.parametrize('seqlen', [8, 16, 32, 64, 128, 256, 512, 784, 1024, 2048, 4096])
# @pytest.mark.parametrize('seqlen', [2048])
@pytest.mark.parametrize('dim', [64, 4096 + 32])
# @pytest.mark.parametrize('dim', [64])
def test_causal_conv1d_varlen(dim, seqlen, width, has_bias, silu_activation, itype):
device = "cuda"
rtol, atol = (3e-4, 1e-3) if itype == torch.float32 else (3e-3, 5e-3)
if itype == torch.bfloat16:
rtol, atol = 1e-2, 5e-2
rtolw, atolw = (1e-3, 1e-3)
# set seed
torch.random.manual_seed(seqlen + dim + width)
batch = 3
seqlens = []
for b in range(batch):
nsplits = torch.randint(1, 5, (1,)).item()
eos_pos = torch.randperm(seqlen - 1)[:nsplits].sort().values
seqlens.append(torch.diff(torch.cat([torch.tensor([-1]), eos_pos, torch.tensor([seqlen - 1])])).tolist())
assert sum(seqlens[-1]) == seqlen
assert all(s > 0 for s in seqlens[-1])
# Only channel_last is supported
x = rearrange(
torch.randn(batch, seqlen, 4096 + dim + 64, device=device, dtype=itype)[:, :, 4096:4096 + dim], "b s d -> b d s"
).requires_grad_()
weight = torch.randn(dim, width, device=device, dtype=torch.float32, requires_grad=True)
if has_bias:
bias = torch.randn(dim, device=device, dtype=torch.float32, requires_grad=True)
else:
bias = None
seq_idx = torch.stack([torch.cat([torch.full((s,), i, dtype=torch.int32, device=device) for i, s in enumerate(sl)], dim=0)
for sl in seqlens], dim=0)
x_ref = x.detach().clone().requires_grad_()
weight_ref = weight.detach().clone().requires_grad_()
bias_ref = bias.detach().clone().requires_grad_() if bias is not None else None
activation = None if not silu_activation else "silu"
out = causal_conv1d_fn(x, weight, bias, seq_idx=seq_idx, activation=activation)
out_ref = []
for b in range(batch):
out_ref_b = []
for x_s in torch.split(x_ref[[b]], seqlens[b], dim=2):
out_ref_b.append(causal_conv1d_ref(x_s, weight_ref, bias_ref, activation=activation))
out_ref.append(torch.cat(out_ref_b, dim=2))
out_ref = torch.cat(out_ref, dim=0)
print(f"Output max diff: {(out - out_ref).abs().max().item()}")
print(f"Output mean diff: {(out - out_ref).abs().mean().item()}")
assert torch.allclose(out, out_ref, rtol=rtol, atol=atol)
g = torch.randn_like(out)
out_ref.backward(g)
out.backward(g)
print(f"dx max diff: {(x.grad - x_ref.grad).abs().max().item()}")
print(f"dweight max diff: {(weight.grad - weight_ref.grad).abs().max().item()}")
if has_bias:
print(f"dbias max diff: {(bias.grad - bias_ref.grad).abs().max().item()}")
assert torch.allclose(x.grad, x_ref.grad.to(dtype=itype), rtol=rtol, atol=atol)
assert torch.allclose(weight.grad, weight_ref.grad, rtol=rtolw, atol=atolw)
if has_bias:
assert torch.allclose(bias.grad, bias_ref.grad, rtol=rtolw, atol=atolw)
/******************************************************************************
* Copyright (c) 2023, Tri Dao.
******************************************************************************/
#pragma once
#ifndef USE_ROCM
#include <cub/config.cuh>
#include <cub/util_ptx.cuh>
#include <cub/util_type.cuh>
#include <cub/block/block_raking_layout.cuh>
// #include <cub/detail/uninitialized_copy.cuh>
#else
#include <hipcub/hipcub.hpp>
namespace cub = hipcub;
#endif
#include "uninitialized_copy.cuh"
/**
* Perform a reverse sequential reduction over \p LENGTH elements of the \p input array. The aggregate is returned.
*/
template <
int LENGTH,
typename T,
typename ReductionOp>
__device__ __forceinline__ T ThreadReverseReduce(const T (&input)[LENGTH], ReductionOp reduction_op) {
static_assert(LENGTH > 0);
T retval = input[LENGTH - 1];
#pragma unroll
for (int i = LENGTH - 2; i >= 0; --i) { retval = reduction_op(retval, input[i]); }
return retval;
}
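// Worked example (illustrative): with LENGTH = 3, input = {a, b, c} and a sum
// reduction_op, retval starts at c, then the loop adds input[1] and input[0],
// returning ((c + b) + a), i.e. the same aggregate as a forward reduction.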
/**
* Perform a sequential inclusive postfix reverse scan over the statically-sized \p input array, seeded with the specified \p postfix. The aggregate is returned.
*/
template <
int LENGTH,
typename T,
typename ScanOp>
__device__ __forceinline__ T ThreadReverseScanInclusive(
const T (&input)[LENGTH],
T (&output)[LENGTH],
ScanOp scan_op,
const T postfix)
{
T inclusive = postfix;
#pragma unroll
for (int i = LENGTH - 1; i >= 0; --i) {
inclusive = scan_op(inclusive, input[i]);
output[i] = inclusive;
}
return inclusive;
}
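// Worked example (illustrative): input = {1, 2, 3}, scan_op = sum, postfix = 10
//   i = 2: inclusive = 10 + 3 = 13, output[2] = 13
//   i = 1: inclusive = 13 + 2 = 15, output[1] = 15
//   i = 0: inclusive = 15 + 1 = 16, output[0] = 16
// and the returned aggregate is 16.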
/**
* Perform a sequential exclusive postfix reverse scan over the statically-sized \p input array, seeded with the specified \p postfix. The aggregate is returned.
*/
template <
int LENGTH,
typename T,
typename ScanOp>
__device__ __forceinline__ T ThreadReverseScanExclusive(
const T (&input)[LENGTH],
T (&output)[LENGTH],
ScanOp scan_op,
const T postfix)
{
// Careful, output may be aliased to input
T exclusive = postfix;
T inclusive;
#pragma unroll
for (int i = LENGTH - 1; i >= 0; --i) {
inclusive = scan_op(exclusive, input[i]);
output[i] = exclusive;
exclusive = inclusive;
}
return inclusive;
}
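// Worked example (illustrative): input = {1, 2, 3}, scan_op = sum, postfix = 10
//   i = 2: output[2] = 10, exclusive becomes 10 + 3 = 13
//   i = 1: output[1] = 13, exclusive becomes 13 + 2 = 15
//   i = 0: output[0] = 15, exclusive becomes 15 + 1 = 16
// and the returned value is the inclusive aggregate, 16.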
/**
* \brief WarpReverseScan provides SHFL-based variants of parallel postfix scan of items partitioned across a CUDA thread warp.
*
* LOGICAL_WARP_THREADS must be a power-of-two
*/
template <
typename T, ///< Data type being scanned
int LOGICAL_WARP_THREADS ///< Number of threads per logical warp
>
struct WarpReverseScan {
//---------------------------------------------------------------------
// Constants and type definitions
//---------------------------------------------------------------------
/// Whether the logical warp size and the PTX warp size coincide
// In hipcub, warp_threads is defined as HIPCUB_WARP_THREADS (i.e. ::rocprim::warp_size()),
// while in cub it's defined as a macro that takes a redundant unused argument.
#ifndef USE_ROCM
#define WARP_THREADS CUB_WARP_THREADS(0)
#else
#define WARP_THREADS HIPCUB_WARP_THREADS
#endif
static constexpr bool IS_ARCH_WARP = (LOGICAL_WARP_THREADS == WARP_THREADS);
/// The number of warp scan steps
static constexpr int STEPS = cub::Log2<LOGICAL_WARP_THREADS>::VALUE;
static_assert(LOGICAL_WARP_THREADS == 1 << STEPS);
//---------------------------------------------------------------------
// Thread fields
//---------------------------------------------------------------------
/// Lane index in logical warp
unsigned int lane_id;
/// Logical warp index in 32-thread physical warp
unsigned int warp_id;
/// 32-thread physical warp member mask of logical warp
unsigned int member_mask;
//---------------------------------------------------------------------
// Construction
//---------------------------------------------------------------------
/// Constructor
explicit __device__ __forceinline__
WarpReverseScan()
: lane_id(cub::LaneId())
, warp_id(IS_ARCH_WARP ? 0 : (lane_id / LOGICAL_WARP_THREADS))
, member_mask(cub::WarpMask<LOGICAL_WARP_THREADS>(warp_id))
{
if (!IS_ARCH_WARP) {
lane_id = lane_id % LOGICAL_WARP_THREADS;
}
}
/// Broadcast
__device__ __forceinline__ T Broadcast(
T input, ///< [in] The value to broadcast
int src_lane) ///< [in] Which warp lane is to do the broadcasting
{
return cub::ShuffleIndex<LOGICAL_WARP_THREADS>(input, src_lane, member_mask);
}
/// Inclusive scan
template <typename ScanOpT>
__device__ __forceinline__ void InclusiveReverseScan(
T input, ///< [in] Calling thread's input item.
T &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input.
ScanOpT scan_op) ///< [in] Binary scan operator
{
inclusive_output = input;
#pragma unroll
for (int STEP = 0; STEP < STEPS; STEP++) {
int offset = 1 << STEP;
T temp = cub::ShuffleDown<LOGICAL_WARP_THREADS>(
inclusive_output, offset, LOGICAL_WARP_THREADS - 1, member_mask
);
// Perform scan op if from a valid peer
inclusive_output = static_cast<int>(lane_id) >= LOGICAL_WARP_THREADS - offset
? inclusive_output : scan_op(temp, inclusive_output);
}
}
/// Exclusive scan
// Get exclusive from inclusive
template <typename ScanOpT>
__device__ __forceinline__ void ExclusiveReverseScan(
T input, ///< [in] Calling thread's input item.
T &exclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input.
ScanOpT scan_op, ///< [in] Binary scan operator
T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items.
{
T inclusive_output;
InclusiveReverseScan(input, inclusive_output, scan_op);
warp_aggregate = cub::ShuffleIndex<LOGICAL_WARP_THREADS>(inclusive_output, 0, member_mask);
// initial value unknown
exclusive_output = cub::ShuffleDown<LOGICAL_WARP_THREADS>(
inclusive_output, 1, LOGICAL_WARP_THREADS - 1, member_mask
);
}
/**
* \brief Computes both inclusive and exclusive reverse scans using the specified binary scan functor across the calling warp. Because no initial value is supplied, the \p exclusive_output computed for the last <em>warp-lane</em> is undefined.
*/
template <typename ScanOpT>
__device__ __forceinline__ void ReverseScan(
T input, ///< [in] Calling thread's input item.
T &inclusive_output, ///< [out] Calling thread's inclusive-scan output item.
T &exclusive_output, ///< [out] Calling thread's exclusive-scan output item.
ScanOpT scan_op) ///< [in] Binary scan operator
{
InclusiveReverseScan(input, inclusive_output, scan_op);
// initial value unknown
exclusive_output = cub::ShuffleDown<LOGICAL_WARP_THREADS>(
inclusive_output, 1, LOGICAL_WARP_THREADS - 1, member_mask
);
}
};
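// Usage sketch (illustrative, not part of this header): each lane of a
// 32-thread warp computes a reverse (suffix) sum of per-lane values:
//
//   WarpReverseScan<float, 32> warp_scan;
//   float inclusive, exclusive;
//   warp_scan.ReverseScan(my_val, inclusive, exclusive, cub::Sum());
//
// Afterwards `inclusive` holds the sum of my_val over this lane and all
// higher-numbered lanes, and `exclusive` holds that sum without my_val
// (undefined for the last lane, since no initial value is supplied).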
/**
* \brief BlockReverseScan provides variants of raking-based parallel postfix scan across a CUDA thread block.
*/
template <
typename T, ///< Data type being scanned
int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension
bool MEMOIZE=false ///< Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure
>
struct BlockReverseScan {
//---------------------------------------------------------------------
// Types and constants
//---------------------------------------------------------------------
/// Constants
/// The thread block size in threads
static constexpr int BLOCK_THREADS = BLOCK_DIM_X;
/// Layout type for padded thread block raking grid
using BlockRakingLayout = cub::BlockRakingLayout<T, BLOCK_THREADS>;
// The number of reduction elements is not a multiple of the number of raking threads for now
static_assert(BlockRakingLayout::UNGUARDED);
/// Number of raking threads
static constexpr int RAKING_THREADS = BlockRakingLayout::RAKING_THREADS;
/// Number of raking elements per warp synchronous raking thread
static constexpr int SEGMENT_LENGTH = BlockRakingLayout::SEGMENT_LENGTH;
/// Cooperative work can be entirely warp synchronous
static constexpr bool WARP_SYNCHRONOUS = (int(BLOCK_THREADS) == int(RAKING_THREADS));
/// WarpReverseScan utility type
using WarpReverseScan = WarpReverseScan<T, RAKING_THREADS>;
/// Shared memory storage layout type
struct _TempStorage {
typename BlockRakingLayout::TempStorage raking_grid; ///< Padded thread block raking grid
};
/// Alias wrapper allowing storage to be unioned
struct TempStorage : cub::Uninitialized<_TempStorage> {};
//---------------------------------------------------------------------
// Per-thread fields
//---------------------------------------------------------------------
// Thread fields
_TempStorage &temp_storage;
unsigned int linear_tid;
T cached_segment[SEGMENT_LENGTH];
//---------------------------------------------------------------------
// Utility methods
//---------------------------------------------------------------------
/// Performs upsweep raking reduction, returning the aggregate
template <typename ScanOp>
__device__ __forceinline__ T Upsweep(ScanOp scan_op) {
T *smem_raking_ptr = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid);
// Read data into registers
#pragma unroll
for (int i = 0; i < SEGMENT_LENGTH; ++i) { cached_segment[i] = smem_raking_ptr[i]; }
T raking_partial = cached_segment[SEGMENT_LENGTH - 1];
#pragma unroll
for (int i = SEGMENT_LENGTH - 2; i >= 0; --i) {
raking_partial = scan_op(raking_partial, cached_segment[i]);
}
return raking_partial;
}
/// Performs exclusive downsweep raking scan
template <typename ScanOp>
__device__ __forceinline__ void ExclusiveDownsweep(
ScanOp scan_op,
T raking_partial)
{
T *smem_raking_ptr = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid);
// Read data back into registers
if (!MEMOIZE) {
#pragma unroll
for (int i = 0; i < SEGMENT_LENGTH; ++i) { cached_segment[i] = smem_raking_ptr[i]; }
}
ThreadReverseScanExclusive(cached_segment, cached_segment, scan_op, raking_partial);
// Write data back to smem
#pragma unroll
for (int i = 0; i < SEGMENT_LENGTH; ++i) { smem_raking_ptr[i] = cached_segment[i]; }
}
//---------------------------------------------------------------------
// Constructors
//---------------------------------------------------------------------
/// Constructor
__device__ __forceinline__ BlockReverseScan(
TempStorage &temp_storage)
:
temp_storage(temp_storage.Alias()),
linear_tid(cub::RowMajorTid(BLOCK_DIM_X, 1, 1))
{}
/// Computes an exclusive thread block-wide postfix scan using the specified binary \p scan_op functor. Each thread contributes one input element. The call-back functor \p block_postfix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically postfixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs.
template <
typename ScanOp,
typename BlockPostfixCallbackOp>
__device__ __forceinline__ void ExclusiveReverseScan(
T input, ///< [in] Calling thread's input item
T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input)
ScanOp scan_op, ///< [in] Binary scan operator
BlockPostfixCallbackOp &block_postfix_callback_op) ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a thread block-wide postfix to be applied to all inputs.
{
if (WARP_SYNCHRONOUS) {
// Short-circuit directly to warp-synchronous scan
T block_aggregate;
WarpReverseScan warp_scan;
warp_scan.ExclusiveReverseScan(input, exclusive_output, scan_op, block_aggregate);
// Obtain warp-wide postfix in lane0, then broadcast to other lanes
T block_postfix = block_postfix_callback_op(block_aggregate);
block_postfix = warp_scan.Broadcast(block_postfix, 0);
exclusive_output = linear_tid == BLOCK_THREADS - 1 ? block_postfix : scan_op(block_postfix, exclusive_output);
} else {
// Place thread partial into shared memory raking grid
T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
detail::uninitialized_copy(placement_ptr, input);
cub::CTA_SYNC();
// Reduce parallelism down to just raking threads
if (linear_tid < RAKING_THREADS) {
WarpReverseScan warp_scan;
// Raking upsweep reduction across shared partials
T upsweep_partial = Upsweep(scan_op);
// Warp-synchronous scan
T exclusive_partial, block_aggregate;
warp_scan.ExclusiveReverseScan(upsweep_partial, exclusive_partial, scan_op, block_aggregate);
// Obtain block-wide postfix in lane0, then broadcast to other lanes
T block_postfix = block_postfix_callback_op(block_aggregate);
block_postfix = warp_scan.Broadcast(block_postfix, 0);
// Update postfix with warpscan exclusive partial
T downsweep_postfix = linear_tid == RAKING_THREADS - 1
? block_postfix : scan_op(block_postfix, exclusive_partial);
// Exclusive raking downsweep scan
ExclusiveDownsweep(scan_op, downsweep_postfix);
}
cub::CTA_SYNC();
// Grab thread postfix from shared memory
exclusive_output = *placement_ptr;
// // Compute warp scan in each warp.
// // The exclusive output from the last lane in each warp is invalid.
// T inclusive_output;
// WarpReverseScan warp_scan;
// warp_scan.ReverseScan(input, inclusive_output, exclusive_output, scan_op);
// // Compute the warp-wide postfix and block-wide aggregate for each warp. Warp postfix for the last warp is invalid.
// T block_aggregate;
// T warp_postfix = ComputeWarpPostfix(scan_op, inclusive_output, block_aggregate);
// // Apply warp postfix to our lane's partial
// if (warp_id != 0) {
// exclusive_output = scan_op(warp_postfix, exclusive_output);
// if (lane_id == 0) { exclusive_output = warp_postfix; }
// }
// // Use the first warp to determine the thread block postfix, returning the result in lane0
// if (warp_id == 0) {
// T block_postfix = block_postfix_callback_op(block_aggregate);
// if (lane_id == 0) {
// // Share the postfix with all threads
// detail::uninitialized_copy(&temp_storage.block_postfix,
// block_postfix);
// exclusive_output = block_postfix; // The block postfix is the exclusive output for tid0
// }
// }
// cub::CTA_SYNC();
// // Incorporate thread block postfix into outputs
// T block_postfix = temp_storage.block_postfix;
// if (linear_tid > 0) { exclusive_output = scan_op(block_postfix, exclusive_output); }
}
}
/**
* \brief Computes an inclusive block-wide postfix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. The call-back functor \p block_postfix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically postfixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs.
*/
template <
int ITEMS_PER_THREAD,
typename ScanOp,
typename BlockPostfixCallbackOp>
__device__ __forceinline__ void InclusiveReverseScan(
T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items
T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input)
ScanOp scan_op, ///< [in] Binary scan functor
BlockPostfixCallbackOp &block_postfix_callback_op) ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a block-wide postfix to be applied to the logical input sequence.
{
// Reduce consecutive thread items in registers
T thread_postfix = ThreadReverseReduce(input, scan_op);
// Exclusive thread block-scan
ExclusiveReverseScan(thread_postfix, thread_postfix, scan_op, block_postfix_callback_op);
// Inclusive scan in registers with postfix as seed
ThreadReverseScanInclusive(input, output, scan_op, thread_postfix);
}
};
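// Usage sketch (illustrative, not part of this header; kNThreads, kNItems and
// postfix_op are hypothetical names): a block-wide reverse inclusive scan over
// kNItems registers per thread, seeded by a caller-provided postfix functor:
//
//   using BlockReverseScanT = BlockReverseScan<float, kNThreads>;
//   __shared__ typename BlockReverseScanT::TempStorage smem_revscan;
//   float thread_data[kNItems];
//   ...
//   BlockReverseScanT(smem_revscan).InclusiveReverseScan(
//       thread_data, thread_data, cub::Sum(), postfix_op);
//
// postfix_op is called with the block-wide aggregate and must return the value
// that logically postfixes the block's inputs (e.g. the carry coming from the
// next chunk of a longer, chunked reverse scan).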
/******************************************************************************
* Copyright (c) 2023, Tri Dao.
******************************************************************************/
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <torch/extension.h>
#include <vector>
#include "selective_scan.h"
#define CHECK_SHAPE(x, ...) TORCH_CHECK(x.sizes() == torch::IntArrayRef({__VA_ARGS__}), #x " must have shape (" #__VA_ARGS__ ")")
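// Illustrative expansion: CHECK_SHAPE(u, batch_size, dim, seqlen) verifies
// u.sizes() == torch::IntArrayRef({batch_size, dim, seqlen}) and reports
// "u must have shape (batch_size, dim, seqlen)" on mismatch.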
#define DISPATCH_ITYPE_FLOAT_AND_HALF_AND_BF16(ITYPE, NAME, ...) \
if (ITYPE == at::ScalarType::Half) { \
using input_t = at::Half; \
__VA_ARGS__(); \
} else if (ITYPE == at::ScalarType::BFloat16) { \
using input_t = at::BFloat16; \
__VA_ARGS__(); \
} else if (ITYPE == at::ScalarType::Float) { \
using input_t = float; \
__VA_ARGS__(); \
} else { \
AT_ERROR(#NAME, " not implemented for input type '", toString(ITYPE), "'"); \
}
#define DISPATCH_WTYPE_FLOAT_AND_HALF_AND_BF16(WTYPE, NAME, ...) \
if (WTYPE == at::ScalarType::Half) { \
using weight_t = at::Half; \
__VA_ARGS__(); \
} else if (WTYPE == at::ScalarType::BFloat16) { \
using weight_t = at::BFloat16; \
__VA_ARGS__(); \
} else if (WTYPE == at::ScalarType::Float) { \
using weight_t = float; \
__VA_ARGS__(); \
} else { \
AT_ERROR(#NAME, " not implemented for weight type '", toString(WTYPE), "'"); \
}
#define DISPATCH_WTYPE_FLOAT_AND_COMPLEX(WTYPE, NAME, ...) \
if (WTYPE == at::ScalarType::Float) { \
using weight_t = float; \
__VA_ARGS__(); \
} else if (WTYPE == at::ScalarType::ComplexFloat) { \
using weight_t = c10::complex<float>; \
__VA_ARGS__(); \
} else { \
AT_ERROR(#NAME, " not implemented for weight type '", toString(WTYPE), "'"); \
}
template<typename input_t, typename weight_t>
void selective_scan_fwd_cuda(SSMParamsBase &params, cudaStream_t stream);
template <typename input_t, typename weight_t>
void selective_scan_bwd_cuda(SSMParamsBwd &params, cudaStream_t stream);
void set_ssm_params_fwd(SSMParamsBase &params,
// sizes
const size_t batch,
const size_t dim,
const size_t seqlen,
const size_t dstate,
const size_t n_groups,
const size_t n_chunks,
const bool is_variable_B,
const bool is_variable_C,
// device pointers
const at::Tensor u,
const at::Tensor delta,
const at::Tensor A,
const at::Tensor B,
const at::Tensor C,
const at::Tensor out,
const at::Tensor z,
const at::Tensor out_z,
void* D_ptr,
void* delta_bias_ptr,
void* x_ptr,
bool has_z,
bool delta_softplus) {
// Reset the parameters
memset(&params, 0, sizeof(params));
params.batch = batch;
params.dim = dim;
params.seqlen = seqlen;
params.dstate = dstate;
params.n_groups = n_groups;
params.n_chunks = n_chunks;
params.dim_ngroups_ratio = dim / n_groups;
params.delta_softplus = delta_softplus;
params.is_variable_B = is_variable_B;
params.is_variable_C = is_variable_C;
// Set the pointers and strides.
params.u_ptr = u.data_ptr();
params.delta_ptr = delta.data_ptr();
params.A_ptr = A.data_ptr();
params.B_ptr = B.data_ptr();
params.C_ptr = C.data_ptr();
params.D_ptr = D_ptr;
params.delta_bias_ptr = delta_bias_ptr;
params.out_ptr = out.data_ptr();
params.x_ptr = x_ptr;
params.z_ptr = has_z ? z.data_ptr() : nullptr;
params.out_z_ptr = has_z ? out_z.data_ptr() : nullptr;
// All strides are in elements, not bytes.
params.A_d_stride = A.stride(0);
params.A_dstate_stride = A.stride(1);
if (!is_variable_B) {
params.B_d_stride = B.stride(0);
} else {
params.B_batch_stride = B.stride(0);
params.B_group_stride = B.stride(1);
}
params.B_dstate_stride = !is_variable_B ? B.stride(1) : B.stride(2);
if (!is_variable_C) {
params.C_d_stride = C.stride(0);
} else {
params.C_batch_stride = C.stride(0);
params.C_group_stride = C.stride(1);
}
params.C_dstate_stride = !is_variable_C ? C.stride(1) : C.stride(2);
params.u_batch_stride = u.stride(0);
params.u_d_stride = u.stride(1);
params.delta_batch_stride = delta.stride(0);
params.delta_d_stride = delta.stride(1);
if (has_z) {
params.z_batch_stride = z.stride(0);
params.z_d_stride = z.stride(1);
params.out_z_batch_stride = out_z.stride(0);
params.out_z_d_stride = out_z.stride(1);
}
params.out_batch_stride = out.stride(0);
params.out_d_stride = out.stride(1);
}
void set_ssm_params_bwd(SSMParamsBwd &params,
// sizes
const size_t batch,
const size_t dim,
const size_t seqlen,
const size_t dstate,
const size_t n_groups,
const size_t n_chunks,
const bool is_variable_B,
const bool is_variable_C,
// device pointers
const at::Tensor u,
const at::Tensor delta,
const at::Tensor A,
const at::Tensor B,
const at::Tensor C,
const at::Tensor z,
const at::Tensor out,
const at::Tensor out_z,
void* D_ptr,
void* delta_bias_ptr,
void* x_ptr,
const at::Tensor dout,
const at::Tensor du,
const at::Tensor ddelta,
const at::Tensor dA,
const at::Tensor dB,
const at::Tensor dC,
const at::Tensor dz,
void* dD_ptr,
void* ddelta_bias_ptr,
bool has_z,
bool delta_softplus,
bool recompute_out_z) {
// Pass in "dout" instead of "out", we're not gonna use "out" unless we have z
set_ssm_params_fwd(params, batch, dim, seqlen, dstate, n_groups, n_chunks, is_variable_B, is_variable_C,
u, delta, A, B, C, has_z ? out : dout,
has_z ? z : dout,
// If not recompute_out_z, pass dout instead of out_z.
// This won't be used by the bwd kernel
recompute_out_z ? out_z : dout,
D_ptr, delta_bias_ptr, x_ptr, has_z, delta_softplus);
if (!recompute_out_z) { params.out_z_ptr = nullptr; }
// Set the pointers and strides.
params.dout_ptr = dout.data_ptr();
params.du_ptr = du.data_ptr();
params.dA_ptr = dA.data_ptr();
params.dB_ptr = dB.data_ptr();
params.dC_ptr = dC.data_ptr();
params.dD_ptr = dD_ptr;
params.ddelta_ptr = ddelta.data_ptr();
params.ddelta_bias_ptr = ddelta_bias_ptr;
params.dz_ptr = has_z ? dz.data_ptr() : nullptr;
// All strides are in elements, not bytes.
params.dout_batch_stride = dout.stride(0);
params.dout_d_stride = dout.stride(1);
params.dA_d_stride = dA.stride(0);
params.dA_dstate_stride = dA.stride(1);
if (!is_variable_B) {
params.dB_d_stride = dB.stride(0);
} else {
params.dB_batch_stride = dB.stride(0);
params.dB_group_stride = dB.stride(1);
}
params.dB_dstate_stride = !is_variable_B ? dB.stride(1) : dB.stride(2);
if (!is_variable_C) {
params.dC_d_stride = dC.stride(0);
} else {
params.dC_batch_stride = dC.stride(0);
params.dC_group_stride = dC.stride(1);
}
params.dC_dstate_stride = !is_variable_C ? dC.stride(1) : dC.stride(2);
params.du_batch_stride = du.stride(0);
params.du_d_stride = du.stride(1);
params.ddelta_batch_stride = ddelta.stride(0);
params.ddelta_d_stride = ddelta.stride(1);
if (has_z) {
params.dz_batch_stride = dz.stride(0);
params.dz_d_stride = dz.stride(1);
}
}
std::vector<at::Tensor>
selective_scan_fwd(const at::Tensor &u, const at::Tensor &delta,
const at::Tensor &A, const at::Tensor &B, const at::Tensor &C,
const c10::optional<at::Tensor> &D_,
const c10::optional<at::Tensor> &z_,
const c10::optional<at::Tensor> &delta_bias_,
bool delta_softplus) {
auto input_type = u.scalar_type();
auto weight_type = A.scalar_type();
TORCH_CHECK(input_type == at::ScalarType::Float || input_type == at::ScalarType::Half || input_type == at::ScalarType::BFloat16);
TORCH_CHECK(weight_type == at::ScalarType::Float || weight_type == at::ScalarType::ComplexFloat);
const bool is_variable_B = B.dim() >= 3;
const bool is_variable_C = C.dim() >= 3;
const bool is_complex = weight_type == at::ScalarType::ComplexFloat;
TORCH_CHECK(delta.scalar_type() == input_type);
TORCH_CHECK(B.scalar_type() == (!is_variable_B ? weight_type : input_type));
TORCH_CHECK(C.scalar_type() == (!is_variable_C ? weight_type : input_type));
TORCH_CHECK(u.is_cuda());
TORCH_CHECK(delta.is_cuda());
TORCH_CHECK(A.is_cuda());
TORCH_CHECK(B.is_cuda());
TORCH_CHECK(C.is_cuda());
TORCH_CHECK(u.stride(-1) == 1 || u.size(-1) == 1);
TORCH_CHECK(delta.stride(-1) == 1 || delta.size(-1) == 1);
const auto sizes = u.sizes();
const int batch_size = sizes[0];
const int dim = sizes[1];
const int seqlen = sizes[2];
const int dstate = A.size(1);
const int n_groups = is_variable_B ? B.size(1) : 1;
TORCH_CHECK(dstate <= 256, "selective_scan only supports state dimension <= 256");
CHECK_SHAPE(u, batch_size, dim, seqlen);
CHECK_SHAPE(delta, batch_size, dim, seqlen);
CHECK_SHAPE(A, dim, dstate);
if (!is_variable_B) {
CHECK_SHAPE(B, dim, dstate);
} else {
CHECK_SHAPE(B, batch_size, n_groups, dstate, !is_complex ? seqlen : seqlen * 2);
TORCH_CHECK(B.stride(-1) == 1 || B.size(-1) == 1);
}
if (!is_variable_C) {
CHECK_SHAPE(C, dim, dstate);
} else {
CHECK_SHAPE(C, batch_size, n_groups, dstate, !is_complex ? seqlen: seqlen * 2);
TORCH_CHECK(C.stride(-1) == 1 || C.size(-1) == 1);
}
if (D_.has_value()) {
auto D = D_.value();
TORCH_CHECK(D.scalar_type() == at::ScalarType::Float);
TORCH_CHECK(D.is_cuda());
TORCH_CHECK(D.stride(-1) == 1 || D.size(-1) == 1);
CHECK_SHAPE(D, dim);
}
if (delta_bias_.has_value()) {
auto delta_bias = delta_bias_.value();
TORCH_CHECK(delta_bias.scalar_type() == at::ScalarType::Float);
TORCH_CHECK(delta_bias.is_cuda());
TORCH_CHECK(delta_bias.stride(-1) == 1 || delta_bias.size(-1) == 1);
CHECK_SHAPE(delta_bias, dim);
}
at::Tensor z, out_z;
const bool has_z = z_.has_value();
if (has_z) {
z = z_.value();
TORCH_CHECK(z.scalar_type() == input_type);
TORCH_CHECK(z.is_cuda());
TORCH_CHECK(z.stride(-1) == 1 || z.size(-1) == 1);
CHECK_SHAPE(z, batch_size, dim, seqlen);
out_z = torch::empty_like(z);
}
const int n_chunks = (seqlen + 2048 - 1) / 2048;
// const int n_chunks = (seqlen + 1024 - 1) / 1024;
// at::Tensor out = torch::empty_like(u);
// Right now u has BHL layout and delta has HBL layout, and we want out to have HBL layout
at::Tensor out = torch::empty_like(delta);
at::Tensor x;
x = torch::empty({batch_size, dim, n_chunks, dstate * 2}, u.options().dtype(weight_type));
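// Illustrative arithmetic for the chunking above: seqlen = 5000 gives
// n_chunks = (5000 + 2047) / 2048 = 3, and x stores the per-chunk intermediate
// scan state, 2 * dstate values (in the weight dtype) per (batch, dim, chunk).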
SSMParamsBase params;
set_ssm_params_fwd(params, batch_size, dim, seqlen, dstate, n_groups, n_chunks, is_variable_B, is_variable_C,
u, delta, A, B, C, out, z, out_z,
D_.has_value() ? D_.value().data_ptr() : nullptr,
delta_bias_.has_value() ? delta_bias_.value().data_ptr() : nullptr,
x.data_ptr(),
has_z,
delta_softplus);
// Otherwise the kernel will be launched from cuda:0 device
// Cast to char to avoid compiler warning about narrowing
at::cuda::CUDAGuard device_guard{(char)u.get_device()};
auto stream = at::cuda::getCurrentCUDAStream().stream();
DISPATCH_ITYPE_FLOAT_AND_HALF_AND_BF16(u.scalar_type(), "selective_scan_fwd", [&] {
DISPATCH_WTYPE_FLOAT_AND_COMPLEX(A.scalar_type(), "selective_scan_fwd", [&] {
selective_scan_fwd_cuda<input_t, weight_t>(params, stream);
});
});
std::vector<at::Tensor> result = {out, x};
if (has_z) { result.push_back(out_z); }
return result;
}
std::vector<at::Tensor>
selective_scan_bwd(const at::Tensor &u, const at::Tensor &delta,
const at::Tensor &A, const at::Tensor &B, const at::Tensor &C,
const c10::optional<at::Tensor> &D_,
const c10::optional<at::Tensor> &z_,
const c10::optional<at::Tensor> &delta_bias_,
const at::Tensor &dout,
const c10::optional<at::Tensor> &x_,
const c10::optional<at::Tensor> &out_,
c10::optional<at::Tensor> &dz_,
bool delta_softplus,
bool recompute_out_z) {
auto input_type = u.scalar_type();
auto weight_type = A.scalar_type();
TORCH_CHECK(input_type == at::ScalarType::Float || input_type == at::ScalarType::Half || input_type == at::ScalarType::BFloat16);
TORCH_CHECK(weight_type == at::ScalarType::Float || weight_type == at::ScalarType::ComplexFloat);
const bool is_variable_B = B.dim() >= 3;
const bool is_variable_C = C.dim() >= 3;
const bool is_complex = weight_type == at::ScalarType::ComplexFloat;
TORCH_CHECK(delta.scalar_type() == input_type);
TORCH_CHECK(B.scalar_type() == (!is_variable_B ? weight_type : input_type));
TORCH_CHECK(C.scalar_type() == (!is_variable_C ? weight_type : input_type));
TORCH_CHECK(dout.scalar_type() == input_type);
TORCH_CHECK(u.is_cuda());
TORCH_CHECK(delta.is_cuda());
TORCH_CHECK(A.is_cuda());
TORCH_CHECK(B.is_cuda());
TORCH_CHECK(C.is_cuda());
TORCH_CHECK(dout.is_cuda());
TORCH_CHECK(u.stride(-1) == 1 || u.size(-1) == 1);
TORCH_CHECK(delta.stride(-1) == 1 || delta.size(-1) == 1);
TORCH_CHECK(dout.stride(-1) == 1 || dout.size(-1) == 1);
const auto sizes = u.sizes();
const int batch_size = sizes[0];
const int dim = sizes[1];
const int seqlen = sizes[2];
const int dstate = A.size(1);
const int n_groups = is_variable_B ? B.size(1) : 1;
TORCH_CHECK(dstate <= 256, "selective_scan only supports state dimension <= 256");
CHECK_SHAPE(u, batch_size, dim, seqlen);
CHECK_SHAPE(delta, batch_size, dim, seqlen);
CHECK_SHAPE(A, dim, dstate);
if (!is_variable_B) {
CHECK_SHAPE(B, dim, dstate);
} else {
CHECK_SHAPE(B, batch_size, n_groups, dstate, !is_complex ? seqlen : seqlen * 2);
TORCH_CHECK(B.stride(-1) == 1 || B.size(-1) == 1);
}
if (!is_variable_C) {
CHECK_SHAPE(C, dim, dstate);
} else {
CHECK_SHAPE(C, batch_size, n_groups, dstate, !is_complex ? seqlen: seqlen * 2);
TORCH_CHECK(C.stride(-1) == 1 || C.size(-1) == 1);
}
CHECK_SHAPE(dout, batch_size, dim, seqlen);
if (D_.has_value()) {
auto D = D_.value();
TORCH_CHECK(D.scalar_type() == at::ScalarType::Float);
TORCH_CHECK(D.is_cuda());
TORCH_CHECK(D.stride(-1) == 1 || D.size(-1) == 1);
CHECK_SHAPE(D, dim);
}
if (delta_bias_.has_value()) {
auto delta_bias = delta_bias_.value();
TORCH_CHECK(delta_bias.scalar_type() == at::ScalarType::Float);
TORCH_CHECK(delta_bias.is_cuda());
TORCH_CHECK(delta_bias.stride(-1) == 1 || delta_bias.size(-1) == 1);
CHECK_SHAPE(delta_bias, dim);
}
at::Tensor z, out, dz, out_z;
const bool has_z = z_.has_value();
if (has_z) {
z = z_.value();
TORCH_CHECK(z.scalar_type() == input_type);
TORCH_CHECK(z.is_cuda());
TORCH_CHECK(z.stride(-1) == 1 || z.size(-1) == 1);
CHECK_SHAPE(z, batch_size, dim, seqlen);
TORCH_CHECK(out_.has_value());
out = out_.value();
TORCH_CHECK(out.scalar_type() == input_type);
TORCH_CHECK(out.is_cuda());
TORCH_CHECK(out.stride(-1) == 1 || out.size(-1) == 1);
CHECK_SHAPE(out, batch_size, dim, seqlen);
if (dz_.has_value()) {
dz = dz_.value();
TORCH_CHECK(dz.scalar_type() == input_type);
TORCH_CHECK(dz.is_cuda());
TORCH_CHECK(dz.stride(-1) == 1 || dz.size(-1) == 1);
CHECK_SHAPE(dz, batch_size, dim, seqlen);
} else {
dz = torch::empty_like(z);
}
if (recompute_out_z) {
out_z = torch::empty_like(out);
}
}
const int n_chunks = (seqlen + 2048 - 1) / 2048;
// const int n_chunks = (seqlen + 1024 - 1) / 1024;
if (n_chunks > 1) { TORCH_CHECK(x_.has_value()); }
if (x_.has_value()) {
auto x = x_.value();
TORCH_CHECK(x.scalar_type() == weight_type);
TORCH_CHECK(x.is_cuda());
TORCH_CHECK(x.is_contiguous());
CHECK_SHAPE(x, batch_size, dim, n_chunks, 2 * dstate);
}
at::Tensor du = torch::empty_like(u);
at::Tensor ddelta = torch::empty_like(delta);
at::Tensor dA = torch::zeros_like(A);
at::Tensor dB = !is_variable_B ? torch::zeros_like(B) : torch::zeros_like(B, B.options().dtype(torch::kFloat32));
at::Tensor dC = !is_variable_C ? torch::zeros_like(C) : torch::zeros_like(C, C.options().dtype(torch::kFloat32));
at::Tensor dD;
if (D_.has_value()) { dD = torch::zeros_like(D_.value()); }
at::Tensor ddelta_bias;
if (delta_bias_.has_value()) { ddelta_bias = torch::zeros_like(delta_bias_.value()); }
SSMParamsBwd params;
set_ssm_params_bwd(params, batch_size, dim, seqlen, dstate, n_groups, n_chunks, is_variable_B, is_variable_C,
u, delta, A, B, C, z, out, out_z,
D_.has_value() ? D_.value().data_ptr() : nullptr,
delta_bias_.has_value() ? delta_bias_.value().data_ptr() : nullptr,
x_.has_value() ? x_.value().data_ptr() : nullptr,
dout, du, ddelta, dA, dB, dC, dz,
D_.has_value() ? dD.data_ptr() : nullptr,
delta_bias_.has_value() ? ddelta_bias.data_ptr() : nullptr,
has_z, delta_softplus, recompute_out_z);
// Otherwise the kernel will be launched from cuda:0 device
// Cast to char to avoid compiler warning about narrowing
at::cuda::CUDAGuard device_guard{(char)u.get_device()};
auto stream = at::cuda::getCurrentCUDAStream().stream();
DISPATCH_ITYPE_FLOAT_AND_HALF_AND_BF16(u.scalar_type(), "selective_scan_bwd", [&] {
DISPATCH_WTYPE_FLOAT_AND_COMPLEX(A.scalar_type(), "selective_scan_bwd", [&] {
selective_scan_bwd_cuda<input_t, weight_t>(params, stream);
});
});
std::vector<at::Tensor> result = {du, ddelta, dA, dB.to(B.dtype()), dC.to(C.dtype()), dD, ddelta_bias};
if (has_z) { result.push_back(dz); }
if (recompute_out_z) { result.push_back(out_z); }
return result;
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("fwd", &selective_scan_fwd, "Selective scan forward");
m.def("bwd", &selective_scan_bwd, "Selective scan backward");
}
/******************************************************************************
* Copyright (c) 2023, Tri Dao.
******************************************************************************/
#pragma once
////////////////////////////////////////////////////////////////////////////////////////////////////
struct SSMScanParamsBase {
using index_t = uint32_t;
int batch, seqlen, n_chunks;
index_t a_batch_stride;
index_t b_batch_stride;
index_t out_batch_stride;
// Common data pointers.
void *__restrict__ a_ptr;
void *__restrict__ b_ptr;
void *__restrict__ out_ptr;
void *__restrict__ x_ptr;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
struct SSMParamsBase {
using index_t = uint32_t;
int batch, dim, seqlen, dstate, n_groups, n_chunks;
int dim_ngroups_ratio;
bool is_variable_B;
bool is_variable_C;
bool delta_softplus;
index_t A_d_stride;
index_t A_dstate_stride;
index_t B_batch_stride;
index_t B_d_stride;
index_t B_dstate_stride;
index_t B_group_stride;
index_t C_batch_stride;
index_t C_d_stride;
index_t C_dstate_stride;
index_t C_group_stride;
index_t u_batch_stride;
index_t u_d_stride;
index_t delta_batch_stride;
index_t delta_d_stride;
index_t z_batch_stride;
index_t z_d_stride;
index_t out_batch_stride;
index_t out_d_stride;
index_t out_z_batch_stride;
index_t out_z_d_stride;
// Common data pointers.
void *__restrict__ A_ptr;
void *__restrict__ B_ptr;
void *__restrict__ C_ptr;
void *__restrict__ D_ptr;
void *__restrict__ u_ptr;
void *__restrict__ delta_ptr;
void *__restrict__ delta_bias_ptr;
void *__restrict__ out_ptr;
void *__restrict__ x_ptr;
void *__restrict__ z_ptr;
void *__restrict__ out_z_ptr;
};
struct SSMParamsBwd: public SSMParamsBase {
index_t dout_batch_stride;
index_t dout_d_stride;
index_t dA_d_stride;
index_t dA_dstate_stride;
index_t dB_batch_stride;
index_t dB_group_stride;
index_t dB_d_stride;
index_t dB_dstate_stride;
index_t dC_batch_stride;
index_t dC_group_stride;
index_t dC_d_stride;
index_t dC_dstate_stride;
index_t du_batch_stride;
index_t du_d_stride;
index_t dz_batch_stride;
index_t dz_d_stride;
index_t ddelta_batch_stride;
index_t ddelta_d_stride;
// Common data pointers.
void *__restrict__ dout_ptr;
void *__restrict__ dA_ptr;
void *__restrict__ dB_ptr;
void *__restrict__ dC_ptr;
void *__restrict__ dD_ptr;
void *__restrict__ du_ptr;
void *__restrict__ dz_ptr;
void *__restrict__ ddelta_ptr;
void *__restrict__ ddelta_bias_ptr;
};
/******************************************************************************
* Copyright (c) 2023, Tri Dao.
******************************************************************************/
// Split into multiple files to compile in parallel
#include "selective_scan_bwd_kernel.cuh"
template void selective_scan_bwd_cuda<at::BFloat16, complex_t>(SSMParamsBwd &params, cudaStream_t stream);
/******************************************************************************
* Copyright (c) 2023, Tri Dao.
******************************************************************************/
// Split into multiple files to compile in parallel
#include "selective_scan_bwd_kernel.cuh"
template void selective_scan_bwd_cuda<at::BFloat16, float>(SSMParamsBwd &params, cudaStream_t stream);
/******************************************************************************
* Copyright (c) 2023, Tri Dao.
******************************************************************************/
// Split into multiple files to compile in parallel
#include "selective_scan_bwd_kernel.cuh"
template void selective_scan_bwd_cuda<at::Half, complex_t>(SSMParamsBwd &params, cudaStream_t stream);
/******************************************************************************
* Copyright (c) 2023, Tri Dao.
******************************************************************************/
// Split into multiple files to compile in parallel
#include "selective_scan_bwd_kernel.cuh"
template void selective_scan_bwd_cuda<at::Half, float>(SSMParamsBwd &params, cudaStream_t stream);
/******************************************************************************
* Copyright (c) 2023, Tri Dao.
******************************************************************************/
// Split into multiple files to compile in parallel
#include "selective_scan_bwd_kernel.cuh"
template void selective_scan_bwd_cuda<float, complex_t>(SSMParamsBwd &params, cudaStream_t stream);
/******************************************************************************
* Copyright (c) 2023, Tri Dao.
******************************************************************************/
// Split into multiple files to compile in parallel
#include "selective_scan_bwd_kernel.cuh"
template void selective_scan_bwd_cuda<float, float>(SSMParamsBwd &params, cudaStream_t stream);
/******************************************************************************
* Copyright (c) 2023, Tri Dao.
******************************************************************************/
#pragma once
#include <c10/util/BFloat16.h>
#include <c10/util/Half.h>
#include <c10/cuda/CUDAException.h> // For C10_CUDA_CHECK and C10_CUDA_KERNEL_LAUNCH_CHECK
#include <ATen/cuda/Atomic.cuh> // For atomicAdd on complex
#ifndef USE_ROCM
#include <cub/block/block_load.cuh>
#include <cub/block/block_store.cuh>
#include <cub/block/block_scan.cuh>
#include <cub/block/block_reduce.cuh>
#else
#include <hipcub/hipcub.hpp>
namespace cub = hipcub;
#endif
#include "selective_scan.h"
#include "selective_scan_common.h"
#include "reverse_scan.cuh"
#include "static_switch.h"
template<typename scalar_t> __device__ __forceinline__ scalar_t conj(scalar_t x);
template<> __device__ __forceinline__ float conj<float>(float x) { return x; }
template<> __device__ __forceinline__ complex_t conj<complex_t>(complex_t x) { return std::conj(x); }
template<int kNThreads_, int kNItems_, bool kIsEvenLen_, bool kIsVariableB_, bool kIsVariableC_,
bool kDeltaSoftplus_, bool kHasZ_, typename input_t_, typename weight_t_>
struct Selective_Scan_bwd_kernel_traits {
static_assert(kNItems_ % 4 == 0);
using input_t = input_t_;
using weight_t = weight_t_;
static constexpr int kNThreads = kNThreads_;
static constexpr int kNItems = kNItems_;
static constexpr int kNBytes = sizeof(input_t);
static_assert(kNBytes == 2 || kNBytes == 4);
static constexpr int kNElts = kNBytes == 4 ? 4 : constexpr_min(8, kNItems);
static_assert(kNItems % kNElts == 0);
static constexpr int kNLoads = kNItems / kNElts;
static constexpr bool kIsComplex = std::is_same_v<weight_t, complex_t>;
static constexpr bool kIsEvenLen = kIsEvenLen_;
static constexpr bool kIsVariableB = kIsVariableB_;
static constexpr bool kIsVariableC = kIsVariableC_;
static constexpr bool kDeltaSoftplus = kDeltaSoftplus_;
static constexpr bool kHasZ = kHasZ_;
// Setting MinBlocksPerMP to be 3 (instead of 2) for 128 threads with float improves occupancy.
// For complex this would lead to massive register spilling, so we keep it at 2.
static constexpr int kMinBlocks = kNThreads == 128 && !kIsComplex ? 3 : 2;
using vec_t = typename BytesToType<kNBytes * kNElts>::Type;
using scan_t = std::conditional_t<!kIsComplex, float2, float4>;
using BlockLoadT = cub::BlockLoad<input_t, kNThreads, kNItems, cub::BLOCK_LOAD_WARP_TRANSPOSE>;
using BlockLoadVecT = cub::BlockLoad<vec_t, kNThreads, kNLoads, cub::BLOCK_LOAD_WARP_TRANSPOSE>;
using BlockLoadWeightT = cub::BlockLoad<input_t, kNThreads, !kIsComplex ? kNItems : kNItems * 2, cub::BLOCK_LOAD_WARP_TRANSPOSE>;
using BlockLoadWeightVecT = cub::BlockLoad<vec_t, kNThreads, !kIsComplex ? kNLoads : kNLoads * 2, cub::BLOCK_LOAD_WARP_TRANSPOSE>;
using BlockStoreT = cub::BlockStore<input_t, kNThreads, kNItems, cub::BLOCK_STORE_WARP_TRANSPOSE>;
using BlockStoreVecT = cub::BlockStore<vec_t, kNThreads, kNLoads, cub::BLOCK_STORE_WARP_TRANSPOSE>;
// using BlockScanT = cub::BlockScan<scan_t, kNThreads, cub::BLOCK_SCAN_RAKING_MEMOIZE>;
using BlockScanT = cub::BlockScan<scan_t, kNThreads, cub::BLOCK_SCAN_RAKING>;
// using BlockScanT = cub::BlockScan<scan_t, kNThreads, cub::BLOCK_SCAN_WARP_SCANS>;
using BlockReverseScanT = BlockReverseScan<scan_t, kNThreads>;
using BlockReduceT = cub::BlockReduce<scan_t, kNThreads>;
using BlockReduceFloatT = cub::BlockReduce<float, kNThreads>;
using BlockReduceComplexT = cub::BlockReduce<complex_t, kNThreads>;
using BlockExchangeT = cub::BlockExchange<float, kNThreads, !kIsComplex ? kNItems : kNItems * 2>;
static constexpr int kSmemIOSize = custom_max({sizeof(typename BlockLoadT::TempStorage),
sizeof(typename BlockLoadVecT::TempStorage),
(int(kIsVariableB) + int(kIsVariableC)) * sizeof(typename BlockLoadWeightT::TempStorage),
(int(kIsVariableB) + int(kIsVariableC)) * sizeof(typename BlockLoadWeightVecT::TempStorage),
sizeof(typename BlockStoreT::TempStorage),
sizeof(typename BlockStoreVecT::TempStorage)});
static constexpr int kSmemExchangeSize = (int(kIsVariableB) + int(kIsVariableC)) * sizeof(typename BlockExchangeT::TempStorage);
static constexpr int kSmemReduceSize = sizeof(typename BlockReduceT::TempStorage);
static constexpr int kSmemSize = kSmemIOSize + kSmemExchangeSize + kSmemReduceSize + sizeof(typename BlockScanT::TempStorage) + sizeof(typename BlockReverseScanT::TempStorage);
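    // Dynamic shared-memory layout consumed by the kernel below:
    // [ IO (load/store) | exchange | reduce | scan | reverse scan ], followed by the
    // smem_delta_a, smem_running_postfix, smem_da and smem_dbc scratch arrays.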
};
template<typename Ktraits>
__global__ __launch_bounds__(Ktraits::kNThreads, Ktraits::kMinBlocks)
void selective_scan_bwd_kernel(SSMParamsBwd params) {
constexpr bool kIsComplex = Ktraits::kIsComplex;
constexpr bool kIsVariableB = Ktraits::kIsVariableB;
constexpr bool kIsVariableC = Ktraits::kIsVariableC;
constexpr bool kDeltaSoftplus = Ktraits::kDeltaSoftplus;
constexpr bool kHasZ = Ktraits::kHasZ;
constexpr int kNThreads = Ktraits::kNThreads;
constexpr int kNItems = Ktraits::kNItems;
using input_t = typename Ktraits::input_t;
using weight_t = typename Ktraits::weight_t;
using scan_t = typename Ktraits::scan_t;
// Shared memory.
extern __shared__ char smem_[];
// cast to lvalue reference of expected type
// char *smem_loadstorescan = smem_ + 2 * MAX_DSTATE * sizeof(weight_t);
// auto& smem_load = reinterpret_cast<typename BlockLoadT::TempStorage&>(smem_ + 2 * MAX_DSTATE * sizeof(weight_t));
// auto& smem_load = reinterpret_cast<typename BlockLoadT::TempStorage&>(smem_loadstorescan);
auto& smem_load = reinterpret_cast<typename Ktraits::BlockLoadT::TempStorage&>(smem_);
auto& smem_load_weight = reinterpret_cast<typename Ktraits::BlockLoadWeightT::TempStorage&>(smem_);
auto& smem_load_weight1 = *reinterpret_cast<typename Ktraits::BlockLoadWeightT::TempStorage*>(smem_ + sizeof(typename Ktraits::BlockLoadWeightT::TempStorage));
auto& smem_store = reinterpret_cast<typename Ktraits::BlockStoreT::TempStorage&>(smem_);
auto& smem_exchange = *reinterpret_cast<typename Ktraits::BlockExchangeT::TempStorage*>(smem_ + Ktraits::kSmemIOSize);
auto& smem_exchange1 = *reinterpret_cast<typename Ktraits::BlockExchangeT::TempStorage*>(smem_ + Ktraits::kSmemIOSize + sizeof(typename Ktraits::BlockExchangeT::TempStorage));
auto& smem_reduce = *reinterpret_cast<typename Ktraits::BlockReduceT::TempStorage*>(reinterpret_cast<char *>(&smem_exchange) + Ktraits::kSmemExchangeSize);
auto& smem_reduce_float = *reinterpret_cast<typename Ktraits::BlockReduceFloatT::TempStorage*>(&smem_reduce);
auto& smem_reduce_complex = *reinterpret_cast<typename Ktraits::BlockReduceComplexT::TempStorage*>(&smem_reduce);
auto& smem_scan = *reinterpret_cast<typename Ktraits::BlockScanT::TempStorage*>(reinterpret_cast<char *>(&smem_reduce) + Ktraits::kSmemReduceSize);
auto& smem_reverse_scan = *reinterpret_cast<typename Ktraits::BlockReverseScanT::TempStorage*>(reinterpret_cast<char *>(&smem_scan) + sizeof(typename Ktraits::BlockScanT::TempStorage));
weight_t *smem_delta_a = reinterpret_cast<weight_t *>(smem_ + Ktraits::kSmemSize);
scan_t *smem_running_postfix = reinterpret_cast<scan_t *>(smem_delta_a + 2 * MAX_DSTATE + kNThreads);
weight_t *smem_da = reinterpret_cast<weight_t *>(smem_running_postfix + MAX_DSTATE);
weight_t *smem_dbc = reinterpret_cast<weight_t *>(smem_da + MAX_DSTATE);
const int batch_id = blockIdx.x;
const int dim_id = blockIdx.y;
const int group_id = dim_id / (params.dim_ngroups_ratio);
input_t *u = reinterpret_cast<input_t *>(params.u_ptr) + batch_id * params.u_batch_stride
+ dim_id * params.u_d_stride;
input_t *delta = reinterpret_cast<input_t *>(params.delta_ptr) + batch_id * params.delta_batch_stride
+ dim_id * params.delta_d_stride;
input_t *dout = reinterpret_cast<input_t *>(params.dout_ptr) + batch_id * params.dout_batch_stride
+ dim_id * params.dout_d_stride;
weight_t *A = reinterpret_cast<weight_t *>(params.A_ptr) + dim_id * params.A_d_stride;
weight_t *B = reinterpret_cast<weight_t *>(params.B_ptr) + dim_id * params.B_d_stride;
input_t *Bvar = reinterpret_cast<input_t *>(params.B_ptr) + batch_id * params.B_batch_stride + group_id * params.B_group_stride;
weight_t *C = reinterpret_cast<weight_t *>(params.C_ptr) + dim_id * params.C_d_stride;
input_t *Cvar = reinterpret_cast<input_t *>(params.C_ptr) + batch_id * params.C_batch_stride + group_id * params.C_group_stride;
weight_t *dA = reinterpret_cast<weight_t *>(params.dA_ptr) + dim_id * params.dA_d_stride;
weight_t *dB = reinterpret_cast<weight_t *>(params.dB_ptr)
+ (!kIsVariableB ? dim_id * params.dB_d_stride : batch_id * (!kIsComplex ? params.dB_batch_stride : params.dB_batch_stride / 2) + group_id * params.dB_group_stride);
weight_t *dC = reinterpret_cast<weight_t *>(params.dC_ptr)
+ (!kIsVariableC ? dim_id * params.dC_d_stride : batch_id * (!kIsComplex ? params.dC_batch_stride : params.dC_batch_stride / 2) + group_id * params.dC_group_stride);
float *dD = params.dD_ptr == nullptr ? nullptr : reinterpret_cast<float *>(params.dD_ptr) + dim_id;
float D_val = params.D_ptr == nullptr ? 0 : reinterpret_cast<float *>(params.D_ptr)[dim_id];
float *ddelta_bias = params.ddelta_bias_ptr == nullptr ? nullptr : reinterpret_cast<float *>(params.ddelta_bias_ptr) + dim_id;
float delta_bias = params.delta_bias_ptr == nullptr ? 0 : reinterpret_cast<float *>(params.delta_bias_ptr)[dim_id];
scan_t *x = params.x_ptr == nullptr
? nullptr
: reinterpret_cast<scan_t *>(params.x_ptr) + (batch_id * params.dim + dim_id) * (params.n_chunks) * params.dstate;
float dD_val = 0;
float ddelta_bias_val = 0;
constexpr int kChunkSize = kNThreads * kNItems;
u += (params.n_chunks - 1) * kChunkSize;
delta += (params.n_chunks - 1) * kChunkSize;
dout += (params.n_chunks - 1) * kChunkSize;
Bvar += (params.n_chunks - 1) * kChunkSize * (!kIsComplex ? 1 : 2);
Cvar += (params.n_chunks - 1) * kChunkSize * (!kIsComplex ? 1 : 2);
for (int chunk = params.n_chunks - 1; chunk >= 0; --chunk) {
input_t u_vals[kNItems];
input_t delta_vals_load[kNItems];
input_t dout_vals_load[kNItems];
__syncthreads();
load_input<Ktraits>(u, u_vals, smem_load, params.seqlen - chunk * kChunkSize);
u -= kChunkSize;
__syncthreads();
load_input<Ktraits>(delta, delta_vals_load, smem_load, params.seqlen - chunk * kChunkSize);
// Will reload delta at the same location if kDeltaSoftplus
if constexpr (!kDeltaSoftplus) { delta -= kChunkSize; }
__syncthreads();
load_input<Ktraits>(dout, dout_vals_load, smem_load, params.seqlen - chunk * kChunkSize);
dout -= kChunkSize;
float dout_vals[kNItems], delta_vals[kNItems];
#pragma unroll
for (int i = 0; i < kNItems; ++i) {
dout_vals[i] = float(dout_vals_load[i]);
delta_vals[i] = float(delta_vals_load[i]) + delta_bias;
if constexpr (kDeltaSoftplus) {
delta_vals[i] = delta_vals[i] <= 20.f ? log1pf(expf(delta_vals[i])) : delta_vals[i];
}
}
if constexpr (kHasZ) {
input_t *z = reinterpret_cast<input_t *>(params.z_ptr) + batch_id * params.z_batch_stride
+ dim_id * params.z_d_stride + chunk * kChunkSize;
input_t *out = reinterpret_cast<input_t *>(params.out_ptr) + batch_id * params.out_batch_stride
+ dim_id * params.out_d_stride + chunk * kChunkSize;
input_t *dz = reinterpret_cast<input_t *>(params.dz_ptr) + batch_id * params.dz_batch_stride
+ dim_id * params.dz_d_stride + chunk * kChunkSize;
input_t z_vals[kNItems], out_vals[kNItems];
__syncthreads();
load_input<Ktraits>(z, z_vals, smem_load, params.seqlen - chunk * kChunkSize);
__syncthreads();
load_input<Ktraits>(out, out_vals, smem_load, params.seqlen - chunk * kChunkSize);
float dz_vals[kNItems], z_silu_vals[kNItems];
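            // out_z = out * silu(z), so dz = dout * out * d(silu)/dz
            //       = dout * out * sigmoid(z) * (1 + z * (1 - sigmoid(z))),
            // and dout is rescaled by silu(z) for the gradient flowing back through out.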
#pragma unroll
for (int i = 0; i < kNItems; ++i) {
float z_val = z_vals[i];
float z_sigmoid_val = 1.0f / (1.0f + expf(-z_val));
z_silu_vals[i] = z_val * z_sigmoid_val;
dz_vals[i] = dout_vals[i] * float(out_vals[i]) * z_sigmoid_val
* (1.0f + z_val * (1.0f - z_sigmoid_val));
dout_vals[i] *= z_silu_vals[i];
}
__syncthreads();
store_output<Ktraits>(dz, dz_vals, smem_store, params.seqlen - chunk * kChunkSize);
if (params.out_z_ptr != nullptr) { // Recompute and store out_z
float out_z_vals[kNItems];
#pragma unroll
for (int i = 0; i < kNItems; ++i) { out_z_vals[i] = float(out_vals[i]) * z_silu_vals[i]; }
// if (blockIdx.x == 0 && blockIdx.y == 0 && threadIdx.x == 0) {
// printf("out_val=%f, z_silu_val = %f, out_z_val = %f\n", float(out_vals[0]), z_silu_vals[0], out_z_vals[0]);
// }
input_t *out_z = reinterpret_cast<input_t *>(params.out_z_ptr) + batch_id * params.out_z_batch_stride
+ dim_id * params.out_z_d_stride + chunk * kChunkSize;
__syncthreads();
store_output<Ktraits>(out_z, out_z_vals, smem_store, params.seqlen - chunk * kChunkSize);
}
}
float du_vals[kNItems];
#pragma unroll
for (int i = 0; i < kNItems; ++i) { du_vals[i] = D_val * dout_vals[i]; }
#pragma unroll
for (int i = 0; i < kNItems; ++i) { dD_val += dout_vals[i] * float(u_vals[i]); }
float ddelta_vals[kNItems] = {0};
__syncthreads();
for (int state_idx = 0; state_idx < params.dstate; ++state_idx) {
const weight_t A_val = A[state_idx * params.A_dstate_stride];
// Multiply the real part of A with LOG2E so we can use exp2f instead of expf.
weight_t A_scaled;
constexpr float kLog2e = M_LOG2E;
if constexpr (!kIsComplex) {
A_scaled = A_val * kLog2e;
} else {
A_scaled = complex_t(A_val.real_ * kLog2e, A_val.imag_);
}
weight_t B_val, C_val;
weight_t B_vals[kNItems], C_vals[kNItems];
if constexpr (!kIsVariableB) {
B_val = B[state_idx * params.B_dstate_stride];
} else {
load_weight<Ktraits>(Bvar + state_idx * params.B_dstate_stride, B_vals,
smem_load_weight, (params.seqlen - chunk * kChunkSize) * (!kIsComplex ? 1 : 2));
}
if constexpr (!kIsVariableC) {
C_val = C[state_idx * params.C_dstate_stride];
} else {
auto &smem_load_weight_C = !kIsVariableB ? smem_load_weight : smem_load_weight1;
load_weight<Ktraits>(Cvar + state_idx * params.C_dstate_stride, C_vals,
smem_load_weight_C, (params.seqlen - chunk * kChunkSize) * (!kIsComplex ? 1 : 2));
}
// const weight_t A_val = smem_a[state_idx];
scan_t thread_data[kNItems], thread_reverse_data[kNItems];
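            // thread_data rebuilds the forward recurrence terms (exp(delta * A), delta * u * B) for an
            // inclusive scan, while thread_reverse_data pairs each dout * C term with the *next*
            // element's decay factor so the reverse scan propagates the state gradients backwards.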
if constexpr (!kIsComplex) {
#pragma unroll
for (int i = 0; i < kNItems; ++i) {
const float delta_a_exp = exp2f(delta_vals[i] * A_scaled);
thread_data[i] = make_float2(delta_a_exp, !kIsVariableB ? delta_vals[i] * float(u_vals[i]) : delta_vals[i] * float(u_vals[i]) * B_vals[i]);
if (i == 0) {
smem_delta_a[threadIdx.x == 0 ? state_idx + (chunk % 2) * MAX_DSTATE : threadIdx.x + 2 * MAX_DSTATE] = delta_a_exp;
} else {
thread_reverse_data[i - 1].x = delta_a_exp;
}
thread_reverse_data[i].y = dout_vals[i] *
(!kIsVariableC
? (!kIsVariableB ? B_val * C_val : C_val)
: (!kIsVariableB ? B_val * C_vals[i] : C_vals[i]));
}
__syncthreads();
thread_reverse_data[kNItems - 1].x = threadIdx.x == kNThreads - 1
? (chunk == params.n_chunks - 1 ? 1.f : smem_delta_a[state_idx + ((chunk + 1) % 2) * MAX_DSTATE])
: smem_delta_a[threadIdx.x + 1 + 2 * MAX_DSTATE];
// Initialize running total
scan_t running_prefix = chunk > 0 && threadIdx.x % 32 == 0 ? x[(chunk - 1) * params.dstate + state_idx] : make_float2(1.f, 0.f);
SSMScanPrefixCallbackOp<weight_t> prefix_op(running_prefix);
typename Ktraits::BlockScanT(smem_scan).InclusiveScan(
thread_data, thread_data, SSMScanOp<weight_t>(), prefix_op
);
scan_t running_postfix = chunk < params.n_chunks - 1 && threadIdx.x % 32 == 0 ? smem_running_postfix[state_idx] : make_float2(1.f, 0.f);
SSMScanPrefixCallbackOp<weight_t> postfix_op(running_postfix);
typename Ktraits::BlockReverseScanT(smem_reverse_scan).InclusiveReverseScan(
thread_reverse_data, thread_reverse_data, SSMScanOp<weight_t>(), postfix_op
);
if (threadIdx.x == 0) { smem_running_postfix[state_idx] = postfix_op.running_prefix; }
weight_t dA_val = 0, dBC_val = 0;
weight_t dB_vals[kNItems], dC_vals[kNItems];
#pragma unroll
for (int i = 0; i < kNItems; ++i) {
const float dx = thread_reverse_data[i].y;
const float ddelta_u = !kIsVariableB ? dx : dx * B_vals[i];
du_vals[i] += ddelta_u * delta_vals[i];
const float a = thread_data[i].y - (!kIsVariableB ? delta_vals[i] * float(u_vals[i]) : delta_vals[i] * float(u_vals[i]) * B_vals[i]);
ddelta_vals[i] += ddelta_u * float(u_vals[i]) + dx * A_val * a;
dA_val += dx * delta_vals[i] * a;
if constexpr (!kIsVariableB || !kIsVariableC) {
if constexpr (!kIsVariableB) { // dBC_val is dB_val
dBC_val += dout_vals[i] * (!kIsVariableC ? thread_data[i].y : thread_data[i].y * C_vals[i]);
} else { // dBC_val is dC_val
dBC_val += dout_vals[i] * thread_data[i].y;
}
}
if constexpr (kIsVariableB) { dB_vals[i] = dx * delta_vals[i] * float(u_vals[i]); }
if constexpr (kIsVariableC) {
dC_vals[i] = dout_vals[i] * (!kIsVariableB ? thread_data[i].y * B_val : thread_data[i].y);
}
}
            // Block-exchange to make the atomicAdds coalesced, otherwise they're much slower
if constexpr (kIsVariableB || kIsVariableC) {
if constexpr (kIsVariableB) {
typename Ktraits::BlockExchangeT(smem_exchange).BlockedToStriped(dB_vals, dB_vals);
}
if constexpr (kIsVariableC) {
auto &smem_exchange_C = !kIsVariableB ? smem_exchange : smem_exchange1;
typename Ktraits::BlockExchangeT(smem_exchange_C).BlockedToStriped(dC_vals, dC_vals);
}
const int seqlen_remaining = params.seqlen - chunk * kChunkSize - threadIdx.x;
weight_t *dB_cur = dB + state_idx * params.dB_dstate_stride + chunk * kChunkSize + threadIdx.x;
weight_t *dC_cur = dC + state_idx * params.dC_dstate_stride + chunk * kChunkSize + threadIdx.x;
#pragma unroll
for (int i = 0; i < kNItems; ++i) {
if (i * kNThreads < seqlen_remaining) {
if constexpr (kIsVariableB) { gpuAtomicAdd(dB_cur + i * kNThreads, dB_vals[i]); }
if constexpr (kIsVariableC) { gpuAtomicAdd(dC_cur + i * kNThreads, dC_vals[i]); }
}
}
}
if constexpr (!kIsVariableB || !kIsVariableC) {
float2 dA_dBC_val = make_float2(dA_val, dBC_val);
dA_dBC_val = typename Ktraits::BlockReduceT(smem_reduce).Sum(dA_dBC_val);
dA_val = dA_dBC_val.x;
if (threadIdx.x == 0) {
smem_dbc[state_idx] = chunk == params.n_chunks - 1 ? dA_dBC_val.y : dA_dBC_val.y + smem_dbc[state_idx];
}
} else {
dA_val = typename Ktraits::BlockReduceFloatT(smem_reduce_float).Sum(dA_val);
}
if (threadIdx.x == 0) {
smem_da[state_idx] = chunk == params.n_chunks - 1 ? dA_val : dA_val + smem_da[state_idx];
}
} else {
#pragma unroll
for (int i = 0; i < kNItems; ++i) {
                // PyTorch's implementation of complex exp (which calls Thrust) is very slow
complex_t delta_a_exp = cexp2f(delta_vals[i] * A_scaled);
weight_t B_delta_u_val = !kIsVariableB ? delta_vals[i] * float(u_vals[i]) : B_vals[i] * delta_vals[i] * float(u_vals[i]);
thread_data[i] = make_float4(delta_a_exp.real_, delta_a_exp.imag_, B_delta_u_val.real_, B_delta_u_val.imag_);
if (i == 0) {
smem_delta_a[threadIdx.x == 0 ? state_idx + (chunk % 2) * MAX_DSTATE : threadIdx.x + 2 * MAX_DSTATE] = delta_a_exp;
} else {
thread_reverse_data[i - 1].x = delta_a_exp.real_;
thread_reverse_data[i - 1].y = -delta_a_exp.imag_;
}
complex_t dout_BC = 2 * dout_vals[i]
* conj(!kIsVariableC
? (!kIsVariableB ? B_val * C_val : C_val)
: (!kIsVariableB ? B_val * C_vals[i] : C_vals[i]));
thread_reverse_data[i].z = dout_BC.real_;
thread_reverse_data[i].w = dout_BC.imag_;
}
__syncthreads();
complex_t delta_a_exp = threadIdx.x == kNThreads - 1
? (chunk == params.n_chunks - 1 ? 1.f : smem_delta_a[state_idx + ((chunk + 1) % 2) * MAX_DSTATE])
: smem_delta_a[threadIdx.x + 1 + 2 * MAX_DSTATE];
thread_reverse_data[kNItems - 1].x = delta_a_exp.real_;
thread_reverse_data[kNItems - 1].y = -delta_a_exp.imag_;
// Initialize running total
scan_t running_prefix = chunk > 0 && threadIdx.x % 32 == 0 ? x[(chunk - 1) * params.dstate + state_idx] : make_float4(1.f, 0.f, 0.f, 0.f);
SSMScanPrefixCallbackOp<weight_t> prefix_op(running_prefix);
typename Ktraits::BlockScanT(smem_scan).InclusiveScan(
thread_data, thread_data, SSMScanOp<weight_t>(), prefix_op
);
scan_t running_postfix = chunk < params.n_chunks - 1 && threadIdx.x % 32 == 0 ? smem_running_postfix[state_idx] : make_float4(1.f, 0.f, 0.f, 0.f);
SSMScanPrefixCallbackOp<weight_t> postfix_op(running_postfix);
typename Ktraits::BlockReverseScanT(smem_reverse_scan).InclusiveReverseScan(
thread_reverse_data, thread_reverse_data, SSMScanOp<weight_t>(), postfix_op
);
if (threadIdx.x == 0) { smem_running_postfix[state_idx] = postfix_op.running_prefix; }
weight_t dA_val = 0, dBC_val = 0;
weight_t dB_vals[kNItems], dC_vals[kNItems];
#pragma unroll
for (int i = 0; i < kNItems; ++i) {
complex_t x = complex_t(thread_data[i].z, thread_data[i].w);
complex_t dx = complex_t(thread_reverse_data[i].z, thread_reverse_data[i].w);
float ddelta_u = !kIsVariableB ? dx.real_ : (dx * conj(B_vals[i])).real_;
if constexpr (!kIsVariableB || !kIsVariableC) {
if constexpr (!kIsVariableB) { // dBC_val is dB_val
dBC_val += (2 * dout_vals[i]) * conj(!kIsVariableC ? x : x * C_vals[i]);
} else { // dBC_val is dC_val
dBC_val += (2 * dout_vals[i]) * conj(x);
}
}
const complex_t a_conj = conj(x - (!kIsVariableB ? delta_vals[i] * float(u_vals[i]) : delta_vals[i] * float(u_vals[i]) * B_vals[i]));
du_vals[i] += ddelta_u * delta_vals[i];
ddelta_vals[i] += ddelta_u * float(u_vals[i]) + (dx * conj(A_val) * a_conj).real_;
dA_val += delta_vals[i] * dx * a_conj;
if constexpr (kIsVariableB) { dB_vals[i] = dx * delta_vals[i] * float(u_vals[i]); }
if constexpr (kIsVariableC) {
dC_vals[i] = (2 * dout_vals[i]) * conj(!kIsVariableB ? x * B_val : x);
}
}
            // Block-exchange to make the atomicAdds coalesced, otherwise they're much slower
if constexpr (kIsVariableB || kIsVariableC) {
float dB_vals_f[kNItems * 2], dC_vals_f[kNItems * 2];
if constexpr (kIsVariableB) {
#pragma unroll
for (int i = 0; i < kNItems; ++i) {
dB_vals_f[i * 2] = dB_vals[i].real_;
dB_vals_f[i * 2 + 1] = dB_vals[i].imag_;
}
typename Ktraits::BlockExchangeT(smem_exchange).BlockedToStriped(dB_vals_f, dB_vals_f);
}
if constexpr (kIsVariableC) {
#pragma unroll
for (int i = 0; i < kNItems; ++i) {
dC_vals_f[i * 2] = dC_vals[i].real_;
dC_vals_f[i * 2 + 1] = dC_vals[i].imag_;
}
auto &smem_exchange_C = !kIsVariableB ? smem_exchange : smem_exchange1;
typename Ktraits::BlockExchangeT(smem_exchange_C).BlockedToStriped(dC_vals_f, dC_vals_f);
}
const int seqlen_remaining = (params.seqlen - chunk * kChunkSize) * 2 - threadIdx.x;
float *dB_cur = reinterpret_cast<float *>(dB) + state_idx * params.dB_dstate_stride + chunk * kChunkSize * 2 + threadIdx.x;
float *dC_cur = reinterpret_cast<float *>(dC) + state_idx * params.dC_dstate_stride + chunk * kChunkSize * 2 + threadIdx.x;
#pragma unroll
for (int i = 0; i < kNItems * 2; ++i) {
if (i * kNThreads < seqlen_remaining) {
if constexpr (kIsVariableB) { gpuAtomicAdd(dB_cur + i * kNThreads, dB_vals_f[i]); }
if constexpr (kIsVariableC) { gpuAtomicAdd(dC_cur + i * kNThreads, dC_vals_f[i]); }
}
}
}
if constexpr (!kIsVariableB || !kIsVariableC) {
float4 dA_dBC_val = make_float4(dA_val.real_, dA_val.imag_, dBC_val.real_, dBC_val.imag_);
dA_dBC_val = typename Ktraits::BlockReduceT(smem_reduce).Sum(dA_dBC_val);
dA_val = complex_t(dA_dBC_val.x, dA_dBC_val.y);
dBC_val = complex_t(dA_dBC_val.z, dA_dBC_val.w);
if (threadIdx.x == 0) {
smem_dbc[state_idx] = chunk == params.n_chunks - 1 ? dBC_val : dBC_val + smem_dbc[state_idx];
}
} else {
dA_val = typename Ktraits::BlockReduceComplexT(smem_reduce_complex).Sum(dA_val);
}
if (threadIdx.x == 0) {
smem_da[state_idx] = chunk == params.n_chunks - 1 ? dA_val : dA_val + smem_da[state_idx];
}
}
}
if constexpr (kDeltaSoftplus) {
__syncthreads();
input_t delta_vals_load[kNItems];
load_input<Ktraits>(delta, delta_vals_load, smem_load, params.seqlen - chunk * kChunkSize);
delta -= kChunkSize;
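            // d/dx softplus(x) = sigmoid(x), so the delta gradient is scaled by 1 / (1 + exp(-delta)),
            // except for large delta where the forward pass treated softplus as the identity.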
#pragma unroll
for (int i = 0; i < kNItems; ++i) {
float delta_val = float(delta_vals_load[i]) + delta_bias;
float delta_val_neg_exp = expf(-delta_val);
ddelta_vals[i] = delta_val <= 20.f
? ddelta_vals[i] / (1.f + delta_val_neg_exp)
: ddelta_vals[i];
}
}
for (int i = 0; i < kNItems; ++i) { ddelta_bias_val += ddelta_vals[i]; }
input_t *du = reinterpret_cast<input_t *>(params.du_ptr) + batch_id * params.du_batch_stride
+ dim_id * params.du_d_stride + chunk * kChunkSize;
input_t *ddelta = reinterpret_cast<input_t *>(params.ddelta_ptr) + batch_id * params.ddelta_batch_stride
+ dim_id * params.ddelta_d_stride + chunk * kChunkSize;
__syncthreads();
store_output<Ktraits>(du, du_vals, smem_store, params.seqlen - chunk * kChunkSize);
__syncthreads();
store_output<Ktraits>(ddelta, ddelta_vals, smem_store, params.seqlen - chunk * kChunkSize);
Bvar -= kChunkSize * (!kIsComplex ? 1 : 2);
Cvar -= kChunkSize * (!kIsComplex ? 1 : 2);
}
if (params.dD_ptr != nullptr) {
dD_val = typename Ktraits::BlockReduceFloatT(smem_reduce_float).Sum(dD_val);
if (threadIdx.x == 0) { gpuAtomicAdd(dD, dD_val); }
}
if (params.ddelta_bias_ptr != nullptr) {
__syncthreads();
ddelta_bias_val = typename Ktraits::BlockReduceFloatT(smem_reduce_float).Sum(ddelta_bias_val);
if (threadIdx.x == 0) { gpuAtomicAdd(ddelta_bias, ddelta_bias_val); }
}
for (int state_idx = threadIdx.x; state_idx < params.dstate; state_idx += blockDim.x) {
gpuAtomicAdd(&(dA[state_idx * params.dA_dstate_stride]), smem_da[state_idx]);
weight_t dBC_val;
if (!kIsVariableB || !kIsVariableC) { dBC_val = smem_dbc[state_idx]; }
if constexpr (!kIsVariableB) {
gpuAtomicAdd(&(dB[state_idx * params.dB_dstate_stride]),
!kIsVariableC ? dBC_val * conj(C[state_idx * params.C_dstate_stride]) : dBC_val);
}
if constexpr (!kIsVariableC) {
gpuAtomicAdd(&(dC[state_idx * params.dC_dstate_stride]),
!kIsVariableB ? dBC_val * conj(B[state_idx * params.B_dstate_stride]) : dBC_val);
}
}
}
template<int kNThreads, int kNItems, typename input_t, typename weight_t>
void selective_scan_bwd_launch(SSMParamsBwd &params, cudaStream_t stream) {
BOOL_SWITCH(params.seqlen % (kNThreads * kNItems) == 0, kIsEvenLen, [&] {
BOOL_SWITCH(params.is_variable_B, kIsVariableB, [&] {
BOOL_SWITCH(params.is_variable_C, kIsVariableC, [&] {
BOOL_SWITCH(params.delta_softplus, kDeltaSoftplus, [&] {
BOOL_SWITCH(params.z_ptr != nullptr , kHasZ, [&] {
using Ktraits = Selective_Scan_bwd_kernel_traits<kNThreads, kNItems, kIsEvenLen, kIsVariableB, kIsVariableC, kDeltaSoftplus, kHasZ, input_t, weight_t>;
// using Ktraits = Selective_Scan_bwd_kernel_traits<kNThreads, kNItems, true, kIsVariableB, kIsVariableC, kDeltaSoftplus, kHasZ, input_t, weight_t>;
// TODO: check this
constexpr int kSmemSize = Ktraits::kSmemSize + MAX_DSTATE * sizeof(typename Ktraits::scan_t) + (kNThreads + 4 * MAX_DSTATE) * sizeof(typename Ktraits::weight_t);
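              // The extra terms account for the scratch arrays carved out past Ktraits::kSmemSize in
              // the kernel: smem_running_postfix (MAX_DSTATE scan_t) plus smem_delta_a, smem_da and
              // smem_dbc, which together take (kNThreads + 4 * MAX_DSTATE) weight_t.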
dim3 grid(params.batch, params.dim);
auto kernel = &selective_scan_bwd_kernel<Ktraits>;
if (kSmemSize >= 48 * 1024) {
#ifndef USE_ROCM
C10_CUDA_CHECK(cudaFuncSetAttribute(
kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemSize));
#else
C10_CUDA_CHECK(cudaFuncSetAttribute(
(void *) kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemSize));
std::cerr << "Warning (selective_scan_bwd_kernel): attempting to set maxDynamicSharedMemorySize on an AMD GPU which is currently a non-op (in ROCm versions <= 6.1). This might lead to undefined behavior. \n" << std::endl;
#endif
}
kernel<<<grid, Ktraits::kNThreads, kSmemSize, stream>>>(params);
C10_CUDA_KERNEL_LAUNCH_CHECK();
});
});
});
});
});
}
template<typename input_t, typename weight_t>
void selective_scan_bwd_cuda(SSMParamsBwd &params, cudaStream_t stream) {
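    // Each launch processes the sequence in chunks of kNThreads * kNItems elements; pick the
    // smallest configuration whose chunk covers the whole sequence, capped at 2048 (128 x 16).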
#ifndef USE_ROCM
if (params.seqlen <= 128) {
selective_scan_bwd_launch<32, 4, input_t, weight_t>(params, stream);
} else if (params.seqlen <= 256) {
selective_scan_bwd_launch<32, 8, input_t, weight_t>(params, stream);
} else if (params.seqlen <= 512) {
selective_scan_bwd_launch<32, 16, input_t, weight_t>(params, stream);
} else if (params.seqlen <= 1024) {
selective_scan_bwd_launch<64, 16, input_t, weight_t>(params, stream);
} else {
selective_scan_bwd_launch<128, 16, input_t, weight_t>(params, stream);
}
#else
if (params.seqlen <= 256) {
selective_scan_bwd_launch<64, 4, input_t, weight_t>(params, stream);
} else if (params.seqlen <= 512) {
selective_scan_bwd_launch<64, 8, input_t, weight_t>(params, stream);
} else if (params.seqlen <= 1024) {
selective_scan_bwd_launch<64, 16, input_t, weight_t>(params, stream);
} else {
selective_scan_bwd_launch<128, 16, input_t, weight_t>(params, stream);
}
#endif
}
/******************************************************************************
* Copyright (c) 2023, Tri Dao.
******************************************************************************/
#pragma once
#ifndef USE_ROCM
#include <cuda_bf16.h>
#else
#include <hip/hip_bf16.h>
#endif
#include <cuda_fp16.h>
#include <c10/util/complex.h> // For scalar_value_type
#ifndef USE_ROCM
constexpr size_t custom_max(std::initializer_list<size_t> ilist)
{
return std::max(ilist);
}
template<typename T>
constexpr T constexpr_min(T a, T b) {
return std::min(a, b);
}
#else
constexpr size_t custom_max(std::initializer_list<size_t> ilist)
{
return *std::max_element(ilist.begin(), ilist.end());
}
template<typename T>
constexpr T constexpr_min(T a, T b) {
return a < b ? a : b;
}
#endif
#define MAX_DSTATE 256
using complex_t = c10::complex<float>;
inline __device__ float2 operator+(const float2 & a, const float2 & b){
return {a.x + b.x, a.y + b.y};
}
inline __device__ float3 operator+(const float3 &a, const float3 &b) {
return {a.x + b.x, a.y + b.y, a.z + b.z};
}
inline __device__ float4 operator+(const float4 & a, const float4 & b){
return {a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w};
}
////////////////////////////////////////////////////////////////////////////////////////////////////
template<int BYTES> struct BytesToType {};
template<> struct BytesToType<16> {
using Type = uint4;
static_assert(sizeof(Type) == 16);
};
template<> struct BytesToType<8> {
using Type = uint64_t;
static_assert(sizeof(Type) == 8);
};
template<> struct BytesToType<4> {
using Type = uint32_t;
static_assert(sizeof(Type) == 4);
};
template<> struct BytesToType<2> {
using Type = uint16_t;
static_assert(sizeof(Type) == 2);
};
template<> struct BytesToType<1> {
using Type = uint8_t;
static_assert(sizeof(Type) == 1);
};
////////////////////////////////////////////////////////////////////////////////////////////////////
template<typename scalar_t, int N>
struct Converter{
static inline __device__ void to_float(const scalar_t (&src)[N], float (&dst)[N]) {
#pragma unroll
for (int i = 0; i < N; ++i) { dst[i] = src[i]; }
}
};
template<int N>
struct Converter<at::Half, N>{
static inline __device__ void to_float(const at::Half (&src)[N], float (&dst)[N]) {
static_assert(N % 2 == 0);
auto &src2 = reinterpret_cast<const half2 (&)[N / 2]>(src);
auto &dst2 = reinterpret_cast<float2 (&)[N / 2]>(dst);
#pragma unroll
for (int i = 0; i < N / 2; ++i) { dst2[i] = __half22float2(src2[i]); }
}
};
#if __CUDA_ARCH__ >= 800
template<int N>
struct Converter<at::BFloat16, N>{
static inline __device__ void to_float(const at::BFloat16 (&src)[N], float (&dst)[N]) {
static_assert(N % 2 == 0);
auto &src2 = reinterpret_cast<const nv_bfloat162 (&)[N / 2]>(src);
auto &dst2 = reinterpret_cast<float2 (&)[N / 2]>(dst);
#pragma unroll
for (int i = 0; i < N / 2; ++i) { dst2[i] = __bfloat1622float2(src2[i]); }
}
};
#endif
////////////////////////////////////////////////////////////////////////////////////////////////////
// From https://stackoverflow.com/questions/9860711/cucomplex-h-and-exp
// and https://forums.developer.nvidia.com/t/complex-number-exponential-function/24696
__device__ __forceinline__ complex_t cexp2f(complex_t z) {
float t = exp2f(z.real_);
float c, s;
sincosf(z.imag_, &s, &c);
return complex_t(c * t, s * t);
}
__device__ __forceinline__ complex_t cexpf(complex_t z) {
float t = expf(z.real_);
float c, s;
sincosf(z.imag_, &s, &c);
return complex_t(c * t, s * t);
}
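// The scan operator below treats each element as an affine update x_t = a_t * x_{t-1} + b_t,
// stored as an (a, b) pair. Composing (a0, b0) followed by (a1, b1) gives (a1 * a0, a1 * b0 + b1),
// which is what the float2 (real) and float4 (complex) specializations compute.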
template<typename scalar_t> struct SSMScanOp;
template<>
struct SSMScanOp<float> {
__device__ __forceinline__ float2 operator()(const float2 &ab0, const float2 &ab1) const {
return make_float2(ab1.x * ab0.x, ab1.x * ab0.y + ab1.y);
}
};
template<>
struct SSMScanOp<complex_t> {
__device__ __forceinline__ float4 operator()(const float4 &ab0, const float4 &ab1) const {
complex_t a0 = complex_t(ab0.x, ab0.y);
complex_t b0 = complex_t(ab0.z, ab0.w);
complex_t a1 = complex_t(ab1.x, ab1.y);
complex_t b1 = complex_t(ab1.z, ab1.w);
complex_t out_a = a1 * a0;
complex_t out_b = a1 * b0 + b1;
return make_float4(out_a.real_, out_a.imag_, out_b.real_, out_b.imag_);
}
};
// A stateful callback functor that maintains a running prefix to be applied
// during consecutive scan operations.
template <typename scalar_t> struct SSMScanPrefixCallbackOp {
using scan_t = std::conditional_t<std::is_same_v<scalar_t, float>, float2, float4>;
scan_t running_prefix;
// Constructor
__device__ SSMScanPrefixCallbackOp(scan_t running_prefix_) : running_prefix(running_prefix_) {}
// Callback operator to be entered by the first warp of threads in the block.
// Thread-0 is responsible for returning a value for seeding the block-wide scan.
__device__ scan_t operator()(scan_t block_aggregate) {
scan_t old_prefix = running_prefix;
running_prefix = SSMScanOp<scalar_t>()(running_prefix, block_aggregate);
return old_prefix;
}
};
////////////////////////////////////////////////////////////////////////////////////////////////////
template<typename Ktraits>
inline __device__ void load_input(typename Ktraits::input_t *u,
typename Ktraits::input_t (&u_vals)[Ktraits::kNItems],
typename Ktraits::BlockLoadT::TempStorage &smem_load,
int seqlen) {
if constexpr (Ktraits::kIsEvenLen) {
auto& smem_load_vec = reinterpret_cast<typename Ktraits::BlockLoadVecT::TempStorage&>(smem_load);
using vec_t = typename Ktraits::vec_t;
typename Ktraits::BlockLoadVecT(smem_load_vec).Load(
reinterpret_cast<vec_t*>(u),
reinterpret_cast<vec_t(&)[Ktraits::kNLoads]>(u_vals)
#ifdef USE_ROCM
, Ktraits::kNThreads * Ktraits::kNLoads
#endif
);
} else {
typename Ktraits::BlockLoadT(smem_load).Load(u, u_vals, seqlen, 0.f);
}
}
template<typename Ktraits>
inline __device__ void load_weight(typename Ktraits::input_t *Bvar,
typename Ktraits::weight_t (&B_vals)[Ktraits::kNItems],
typename Ktraits::BlockLoadWeightT::TempStorage &smem_load_weight,
int seqlen) {
constexpr int kNItems = Ktraits::kNItems;
if constexpr (!Ktraits::kIsComplex) {
typename Ktraits::input_t B_vals_load[kNItems];
if constexpr (Ktraits::kIsEvenLen) {
auto& smem_load_weight_vec = reinterpret_cast<typename Ktraits::BlockLoadWeightVecT::TempStorage&>(smem_load_weight);
using vec_t = typename Ktraits::vec_t;
typename Ktraits::BlockLoadWeightVecT(smem_load_weight_vec).Load(
reinterpret_cast<vec_t*>(Bvar),
reinterpret_cast<vec_t(&)[Ktraits::kNLoads]>(B_vals_load)
);
} else {
typename Ktraits::BlockLoadWeightT(smem_load_weight).Load(Bvar, B_vals_load, seqlen, 0.f);
}
// #pragma unroll
// for (int i = 0; i < kNItems; ++i) { B_vals[i] = B_vals_load[i]; }
Converter<typename Ktraits::input_t, kNItems>::to_float(B_vals_load, B_vals);
} else {
typename Ktraits::input_t B_vals_load[kNItems * 2];
if constexpr (Ktraits::kIsEvenLen) {
auto& smem_load_weight_vec = reinterpret_cast<typename Ktraits::BlockLoadWeightVecT::TempStorage&>(smem_load_weight);
using vec_t = typename Ktraits::vec_t;
typename Ktraits::BlockLoadWeightVecT(smem_load_weight_vec).Load(
reinterpret_cast<vec_t*>(Bvar),
reinterpret_cast<vec_t(&)[Ktraits::kNLoads * 2]>(B_vals_load)
);
} else {
typename Ktraits::BlockLoadWeightT(smem_load_weight).Load(Bvar, B_vals_load, seqlen, 0.f);
}
#pragma unroll
for (int i = 0; i < kNItems; ++i) { B_vals[i] = complex_t(B_vals_load[i * 2], B_vals_load[i * 2 + 1]); }
}
}
template<typename Ktraits>
inline __device__ void store_output(typename Ktraits::input_t *out,
const float (&out_vals)[Ktraits::kNItems],
typename Ktraits::BlockStoreT::TempStorage &smem_store,
int seqlen) {
typename Ktraits::input_t write_vals[Ktraits::kNItems];
#pragma unroll
for (int i = 0; i < Ktraits::kNItems; ++i) { write_vals[i] = out_vals[i]; }
if constexpr (Ktraits::kIsEvenLen) {
auto& smem_store_vec = reinterpret_cast<typename Ktraits::BlockStoreVecT::TempStorage&>(smem_store);
using vec_t = typename Ktraits::vec_t;
typename Ktraits::BlockStoreVecT(smem_store_vec).Store(
reinterpret_cast<vec_t*>(out),
reinterpret_cast<vec_t(&)[Ktraits::kNLoads]>(write_vals)
);
} else {
typename Ktraits::BlockStoreT(smem_store).Store(out, write_vals, seqlen);
}
}
/******************************************************************************
* Copyright (c) 2023, Tri Dao.
******************************************************************************/
// Split into multiple files to compile in parallel
#include "selective_scan_fwd_kernel.cuh"
template void selective_scan_fwd_cuda<at::BFloat16, float>(SSMParamsBase &params, cudaStream_t stream);
template void selective_scan_fwd_cuda<at::BFloat16, complex_t>(SSMParamsBase &params, cudaStream_t stream);
/******************************************************************************
* Copyright (c) 2023, Tri Dao.
******************************************************************************/
// Split into multiple files to compile in parallel
#include "selective_scan_fwd_kernel.cuh"
template void selective_scan_fwd_cuda<at::Half, float>(SSMParamsBase &params, cudaStream_t stream);
template void selective_scan_fwd_cuda<at::Half, complex_t>(SSMParamsBase &params, cudaStream_t stream);
/******************************************************************************
* Copyright (c) 2023, Tri Dao.
******************************************************************************/
// Split into multiple files to compile in parallel
#include "selective_scan_fwd_kernel.cuh"
template void selective_scan_fwd_cuda<float, float>(SSMParamsBase &params, cudaStream_t stream);
template void selective_scan_fwd_cuda<float, complex_t>(SSMParamsBase &params, cudaStream_t stream);
/******************************************************************************
* Copyright (c) 2023, Tri Dao.
******************************************************************************/
#pragma once
#include <c10/util/BFloat16.h>
#include <c10/util/Half.h>
#include <c10/cuda/CUDAException.h> // For C10_CUDA_CHECK and C10_CUDA_KERNEL_LAUNCH_CHECK
#ifndef USE_ROCM
#include <cub/block/block_load.cuh>
#include <cub/block/block_store.cuh>
#include <cub/block/block_scan.cuh>
#else
#include <hipcub/hipcub.hpp>
namespace cub = hipcub;
#endif
#include "selective_scan.h"
#include "selective_scan_common.h"
#include "static_switch.h"
template<int kNThreads_, int kNItems_, int kNRows_, bool kIsEvenLen_,
bool kIsVariableB_, bool kIsVariableC_,
bool kHasZ_, typename input_t_, typename weight_t_>
struct Selective_Scan_fwd_kernel_traits {
static_assert(kNItems_ % 4 == 0);
using input_t = input_t_;
using weight_t = weight_t_;
static constexpr int kNThreads = kNThreads_;
// Setting MinBlocksPerMP to be 3 (instead of 2) for 128 threads improves occupancy.
static constexpr int kMinBlocks = kNThreads < 128 ? 5 : 3;
static constexpr int kNItems = kNItems_;
static constexpr int kNRows = kNRows_;
static constexpr int kNBytes = sizeof(input_t);
static_assert(kNBytes == 2 || kNBytes == 4);
static constexpr int kNElts = kNBytes == 4 ? 4 : constexpr_min(8, kNItems);
static_assert(kNItems % kNElts == 0);
static constexpr int kNLoads = kNItems / kNElts;
static constexpr bool kIsComplex = std::is_same_v<weight_t, complex_t>;
static constexpr bool kIsEvenLen = kIsEvenLen_;
static constexpr bool kIsVariableB = kIsVariableB_;
static constexpr bool kIsVariableC = kIsVariableC_;
static constexpr bool kHasZ = kHasZ_;
static constexpr bool kDirectIO = kIsEvenLen && kNLoads == 1;
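    // kDirectIO: with an even length and a single vectorized load per thread, direct block
    // loads/stores can be used instead of the warp-transpose variants (and some syncs are skipped).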
using vec_t = typename BytesToType<kNBytes * kNElts>::Type;
using scan_t = std::conditional_t<!kIsComplex, float2, float4>;
using BlockLoadT = cub::BlockLoad<input_t, kNThreads, kNItems, cub::BLOCK_LOAD_WARP_TRANSPOSE>;
using BlockLoadVecT = cub::BlockLoad<vec_t, kNThreads, kNLoads,
!kDirectIO ? cub::BLOCK_LOAD_WARP_TRANSPOSE : cub::BLOCK_LOAD_DIRECT>;
using BlockLoadWeightT = cub::BlockLoad<input_t, kNThreads, !kIsComplex ? kNItems : kNItems * 2, cub::BLOCK_LOAD_WARP_TRANSPOSE>;
using BlockLoadWeightVecT = cub::BlockLoad<vec_t, kNThreads, !kIsComplex ? kNLoads : kNLoads * 2,
!kDirectIO ? cub::BLOCK_LOAD_WARP_TRANSPOSE : cub::BLOCK_LOAD_DIRECT>;
using BlockStoreT = cub::BlockStore<input_t, kNThreads, kNItems, cub::BLOCK_STORE_WARP_TRANSPOSE>;
using BlockStoreVecT = cub::BlockStore<vec_t, kNThreads, kNLoads,
!kDirectIO ? cub::BLOCK_STORE_WARP_TRANSPOSE : cub::BLOCK_STORE_DIRECT>;
// using BlockScanT = cub::BlockScan<scan_t, kNThreads, cub::BLOCK_SCAN_RAKING_MEMOIZE>;
// using BlockScanT = cub::BlockScan<scan_t, kNThreads, cub::BLOCK_SCAN_RAKING>;
using BlockScanT = cub::BlockScan<scan_t, kNThreads, cub::BLOCK_SCAN_WARP_SCANS>;
static constexpr int kSmemIOSize = custom_max({sizeof(typename BlockLoadT::TempStorage),
sizeof(typename BlockLoadVecT::TempStorage),
(int(kIsVariableB) + int(kIsVariableC)) * sizeof(typename BlockLoadWeightT::TempStorage),
(int(kIsVariableB) + int(kIsVariableC)) * sizeof(typename BlockLoadWeightVecT::TempStorage),
sizeof(typename BlockStoreT::TempStorage),
sizeof(typename BlockStoreVecT::TempStorage)});
static constexpr int kSmemSize = kSmemIOSize + sizeof(typename BlockScanT::TempStorage);
};
template<typename Ktraits>
__global__ __launch_bounds__(Ktraits::kNThreads, Ktraits::kMinBlocks)
void selective_scan_fwd_kernel(SSMParamsBase params) {
constexpr bool kIsComplex = Ktraits::kIsComplex;
constexpr bool kIsVariableB = Ktraits::kIsVariableB;
constexpr bool kIsVariableC = Ktraits::kIsVariableC;
constexpr bool kHasZ = Ktraits::kHasZ;
constexpr int kNThreads = Ktraits::kNThreads;
constexpr int kNItems = Ktraits::kNItems;
constexpr int kNRows = Ktraits::kNRows;
constexpr bool kDirectIO = Ktraits::kDirectIO;
using input_t = typename Ktraits::input_t;
using weight_t = typename Ktraits::weight_t;
using scan_t = typename Ktraits::scan_t;
// Shared memory.
extern __shared__ char smem_[];
// cast to lvalue reference of expected type
// char *smem_loadstorescan = smem_ + 2 * MAX_DSTATE * sizeof(weight_t);
// auto& smem_load = reinterpret_cast<typename BlockLoadT::TempStorage&>(smem_ + 2 * MAX_DSTATE * sizeof(weight_t));
// auto& smem_load = reinterpret_cast<typename BlockLoadT::TempStorage&>(smem_loadstorescan);
auto& smem_load = reinterpret_cast<typename Ktraits::BlockLoadT::TempStorage&>(smem_);
auto& smem_load_weight = reinterpret_cast<typename Ktraits::BlockLoadWeightT::TempStorage&>(smem_);
auto& smem_load_weight1 = *reinterpret_cast<typename Ktraits::BlockLoadWeightT::TempStorage*>(smem_ + sizeof(typename Ktraits::BlockLoadWeightT::TempStorage));
auto& smem_store = reinterpret_cast<typename Ktraits::BlockStoreT::TempStorage&>(smem_);
auto& smem_scan = *reinterpret_cast<typename Ktraits::BlockScanT::TempStorage*>(smem_ + Ktraits::kSmemIOSize);
// weight_t *smem_a = reinterpret_cast<weight_t *>(smem_ + smem_loadstorescan_size);
// weight_t *smem_bc = reinterpret_cast<weight_t *>(smem_a + MAX_DSTATE);
scan_t *smem_running_prefix = reinterpret_cast<scan_t *>(smem_ + Ktraits::kSmemSize);
const int batch_id = blockIdx.x;
const int dim_id = blockIdx.y;
const int group_id = dim_id / (params.dim_ngroups_ratio);
input_t *u = reinterpret_cast<input_t *>(params.u_ptr) + batch_id * params.u_batch_stride
+ dim_id * kNRows * params.u_d_stride;
input_t *delta = reinterpret_cast<input_t *>(params.delta_ptr) + batch_id * params.delta_batch_stride
+ dim_id * kNRows * params.delta_d_stride;
weight_t *A = reinterpret_cast<weight_t *>(params.A_ptr) + dim_id * kNRows * params.A_d_stride;
weight_t *B = reinterpret_cast<weight_t *>(params.B_ptr) + dim_id * kNRows * params.B_d_stride;
input_t *Bvar = reinterpret_cast<input_t *>(params.B_ptr) + batch_id * params.B_batch_stride + group_id * params.B_group_stride;
weight_t *C = reinterpret_cast<weight_t *>(params.C_ptr) + dim_id * kNRows * params.C_d_stride;
input_t *Cvar = reinterpret_cast<input_t *>(params.C_ptr) + batch_id * params.C_batch_stride + group_id * params.C_group_stride;
scan_t *x = reinterpret_cast<scan_t *>(params.x_ptr) + (batch_id * params.dim + dim_id * kNRows) * params.n_chunks * params.dstate;
float D_val[kNRows] = {0};
if (params.D_ptr != nullptr) {
#pragma unroll
for (int r = 0; r < kNRows; ++r) {
D_val[r] = reinterpret_cast<float *>(params.D_ptr)[dim_id * kNRows + r];
}
}
float delta_bias[kNRows] = {0};
if (params.delta_bias_ptr != nullptr) {
#pragma unroll
for (int r = 0; r < kNRows; ++r) {
delta_bias[r] = reinterpret_cast<float *>(params.delta_bias_ptr)[dim_id * kNRows + r];
}
}
// for (int state_idx = threadIdx.x; state_idx < params.dstate; state_idx += blockDim.x) {
// smem_a[state_idx] = A[state_idx * params.A_dstate_stride];
// smem_bc[state_idx] = B[state_idx * params.B_dstate_stride] * C[state_idx * params.C_dstate_stride];
// }
constexpr int kChunkSize = kNThreads * kNItems;
for (int chunk = 0; chunk < params.n_chunks; ++chunk) {
input_t u_vals[kNRows][kNItems], delta_vals_load[kNRows][kNItems];
__syncthreads();
#pragma unroll
for (int r = 0; r < kNRows; ++r) {
if constexpr (!kDirectIO) {
if (r > 0) { __syncthreads(); }
}
load_input<Ktraits>(u + r * params.u_d_stride, u_vals[r], smem_load, params.seqlen - chunk * kChunkSize);
if constexpr (!kDirectIO) { __syncthreads(); }
load_input<Ktraits>(delta + r * params.delta_d_stride, delta_vals_load[r], smem_load, params.seqlen - chunk * kChunkSize);
}
u += kChunkSize;
delta += kChunkSize;
float delta_vals[kNRows][kNItems], delta_u_vals[kNRows][kNItems], out_vals[kNRows][kNItems];
#pragma unroll
for (int r = 0; r < kNRows; ++r) {
#pragma unroll
for (int i = 0; i < kNItems; ++i) {
float u_val = float(u_vals[r][i]);
delta_vals[r][i] = float(delta_vals_load[r][i]) + delta_bias[r];
if (params.delta_softplus) {
delta_vals[r][i] = delta_vals[r][i] <= 20.f ? log1pf(expf(delta_vals[r][i])) : delta_vals[r][i];
}
delta_u_vals[r][i] = delta_vals[r][i] * u_val;
out_vals[r][i] = D_val[r] * u_val;
}
}
__syncthreads();
for (int state_idx = 0; state_idx < params.dstate; ++state_idx) {
weight_t A_val[kNRows];
#pragma unroll
for (int r = 0; r < kNRows; ++r) {
A_val[r] = A[state_idx * params.A_dstate_stride + r * params.A_d_stride];
// Multiply the real part of A with LOG2E so we can use exp2f instead of expf.
constexpr float kLog2e = M_LOG2E;
if constexpr (!kIsComplex) {
A_val[r] *= kLog2e;
} else {
A_val[r].real_ *= kLog2e;
}
}
// This variable holds B * C if both B and C are constant across seqlen. If only B varies
// across seqlen, this holds C. If only C varies across seqlen, this holds B.
// If both B and C vary, this is unused.
weight_t BC_val[kNRows];
weight_t B_vals[kNItems], C_vals[kNItems];
if constexpr (kIsVariableB) {
load_weight<Ktraits>(Bvar + state_idx * params.B_dstate_stride, B_vals,
smem_load_weight, (params.seqlen - chunk * kChunkSize) * (!kIsComplex ? 1 : 2));
if constexpr (!kIsVariableC) {
#pragma unroll
for (int r = 0; r < kNRows; ++r) {
BC_val[r] = C[state_idx * params.C_dstate_stride + r * params.C_d_stride];
}
}
}
if constexpr (kIsVariableC) {
auto &smem_load_weight_C = !kIsVariableB ? smem_load_weight : smem_load_weight1;
load_weight<Ktraits>(Cvar + state_idx * params.C_dstate_stride, C_vals,
smem_load_weight_C, (params.seqlen - chunk * kChunkSize) * (!kIsComplex ? 1 : 2));
if constexpr (!kIsVariableB) {
#pragma unroll
for (int r = 0; r < kNRows; ++r) {
BC_val[r] = B[state_idx * params.B_dstate_stride + r * params.B_d_stride];
}
}
}
if constexpr (!kIsVariableB && !kIsVariableC) {
#pragma unroll
for (int r = 0; r < kNRows; ++r) {
BC_val[r] = B[state_idx * params.B_dstate_stride + r * params.B_d_stride] * C[state_idx * params.C_dstate_stride + r * params.C_d_stride];
}
}
#pragma unroll
for (int r = 0; r < kNRows; ++r) {
if (r > 0) { __syncthreads(); } // Scan could be using the same smem
scan_t thread_data[kNItems];
#pragma unroll
for (int i = 0; i < kNItems; ++i) {
if constexpr (!kIsComplex) {
thread_data[i] = make_float2(exp2f(delta_vals[r][i] * A_val[r]),
!kIsVariableB ? delta_u_vals[r][i] : B_vals[i] * delta_u_vals[r][i]);
if constexpr (!Ktraits::kIsEvenLen) { // So that the last state is correct
if (threadIdx.x * kNItems + i >= params.seqlen - chunk * kChunkSize) {
thread_data[i] = make_float2(1.f, 0.f);
}
}
} else {
                        // PyTorch's implementation of complex exp (which calls Thrust) is very slow
complex_t delta_a_exp = cexp2f(delta_vals[r][i] * A_val[r]);
weight_t B_delta_u_val = !kIsVariableB ? delta_u_vals[r][i] : B_vals[i] * delta_u_vals[r][i];
thread_data[i] = make_float4(delta_a_exp.real_, delta_a_exp.imag_, B_delta_u_val.real_, B_delta_u_val.imag_);
if constexpr (!Ktraits::kIsEvenLen) { // So that the last state is correct
if (threadIdx.x * kNItems + i >= params.seqlen - chunk * kChunkSize) {
thread_data[i] = make_float4(1.f, 0.f, 0.f, 0.f);
}
}
}
}
// Initialize running total
scan_t running_prefix;
if constexpr (!kIsComplex) {
                    // If we use WARP_SCANS then lane 0 of every warp (not just thread 0) needs to read
running_prefix = chunk > 0 && threadIdx.x % 32 == 0 ? smem_running_prefix[state_idx + r * MAX_DSTATE] : make_float2(1.f, 0.f);
// running_prefix = chunk > 0 && threadIdx.x == 0 ? smem_running_prefix[state_idx] : make_float2(1.f, 0.f);
} else {
running_prefix = chunk > 0 && threadIdx.x % 32 == 0 ? smem_running_prefix[state_idx + r * MAX_DSTATE] : make_float4(1.f, 0.f, 0.f, 0.f);
// running_prefix = chunk > 0 && threadIdx.x == 0 ? smem_running_prefix[state_idx] : make_float4(1.f, 0.f, 0.f, 0.f);
}
SSMScanPrefixCallbackOp<weight_t> prefix_op(running_prefix);
typename Ktraits::BlockScanT(smem_scan).InclusiveScan(
thread_data, thread_data, SSMScanOp<weight_t>(), prefix_op
);
// There's a syncthreads in the scan op, so we don't need to sync here.
// Unless there's only 1 warp, but then it's the same thread (0) reading and writing.
if (threadIdx.x == 0) {
smem_running_prefix[state_idx] = prefix_op.running_prefix;
x[(r * params.n_chunks + chunk) * params.dstate + state_idx] = prefix_op.running_prefix;
}
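                // The running prefix written into x here is what the backward kernel reads back to
                // seed its scans at chunk boundaries.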
#pragma unroll
for (int i = 0; i < kNItems; ++i) {
const weight_t C_val = !kIsVariableC
? BC_val[r]
: (!kIsVariableB ? BC_val[r] * C_vals[i] : C_vals[i]);
if constexpr (!kIsComplex) {
out_vals[r][i] += thread_data[i].y * C_val;
} else {
out_vals[r][i] += (complex_t(thread_data[i].z, thread_data[i].w) * C_val).real_ * 2;
}
}
}
}
input_t *out = reinterpret_cast<input_t *>(params.out_ptr) + batch_id * params.out_batch_stride
+ dim_id * kNRows * params.out_d_stride + chunk * kChunkSize;
__syncthreads();
#pragma unroll
for (int r = 0; r < kNRows; ++r) {
if constexpr (!kDirectIO) {
if (r > 0) { __syncthreads(); }
}
store_output<Ktraits>(out + r * params.out_d_stride, out_vals[r], smem_store, params.seqlen - chunk * kChunkSize);
}
if constexpr (kHasZ) {
input_t *z = reinterpret_cast<input_t *>(params.z_ptr) + batch_id * params.z_batch_stride
+ dim_id * kNRows * params.z_d_stride + chunk * kChunkSize;
input_t *out_z = reinterpret_cast<input_t *>(params.out_z_ptr) + batch_id * params.out_z_batch_stride
+ dim_id * kNRows * params.out_z_d_stride + chunk * kChunkSize;
#pragma unroll
for (int r = 0; r < kNRows; ++r) {
input_t z_vals[kNItems];
__syncthreads();
load_input<Ktraits>(z + r * params.z_d_stride, z_vals, smem_load, params.seqlen - chunk * kChunkSize);
#pragma unroll
for (int i = 0; i < kNItems; ++i) {
float z_val = z_vals[i];
out_vals[r][i] *= z_val / (1 + expf(-z_val));
}
__syncthreads();
store_output<Ktraits>(out_z + r * params.out_z_d_stride, out_vals[r], smem_store, params.seqlen - chunk * kChunkSize);
}
}
Bvar += kChunkSize * (!kIsComplex ? 1 : 2);
Cvar += kChunkSize * (!kIsComplex ? 1 : 2);
}
}
template<int kNThreads, int kNItems, typename input_t, typename weight_t>
void selective_scan_fwd_launch(SSMParamsBase &params, cudaStream_t stream) {
    // Only kNRows == 1 is tested for now, which of course doesn't differ from the previous version
    // where each block processed 1 row.
constexpr int kNRows = 1;
BOOL_SWITCH(params.seqlen % (kNThreads * kNItems) == 0, kIsEvenLen, [&] {
BOOL_SWITCH(params.is_variable_B, kIsVariableB, [&] {
BOOL_SWITCH(params.is_variable_C, kIsVariableC, [&] {
BOOL_SWITCH(params.z_ptr != nullptr , kHasZ, [&] {
using Ktraits = Selective_Scan_fwd_kernel_traits<kNThreads, kNItems, kNRows, kIsEvenLen, kIsVariableB, kIsVariableC, kHasZ, input_t, weight_t>;
constexpr int kSmemSize = Ktraits::kSmemSize + kNRows * MAX_DSTATE * sizeof(typename Ktraits::scan_t);
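            // The extra kNRows * MAX_DSTATE scan_t covers smem_running_prefix, which the kernel
            // carves out immediately past Ktraits::kSmemSize.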
dim3 grid(params.batch, params.dim / kNRows);
            // Had to change this substantially since the HIP interface for setting kernel launch
            // attributes is potentially slightly different from CUDA's. In particular, it seems
            // to expect a plain const void * pointer.
auto kernel = &selective_scan_fwd_kernel<Ktraits>;
if (kSmemSize >= 48 * 1024) {
#ifndef USE_ROCM
C10_CUDA_CHECK(cudaFuncSetAttribute(
kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemSize));
#else
C10_CUDA_CHECK(cudaFuncSetAttribute(
(void *) kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemSize));
std::cerr << "Warning (selective_scan_fwd_kernel): attempting to set maxDynamicSharedMemorySize on an AMD GPU which is currently a non-op (in ROCm versions <= 6.1). This might lead to undefined behavior. \n" << std::endl;
#endif
}
kernel<<<grid, Ktraits::kNThreads, kSmemSize, stream>>>(params);
C10_CUDA_KERNEL_LAUNCH_CHECK();
});
});
});
});
}
template<typename input_t, typename weight_t>
void selective_scan_fwd_cuda(SSMParamsBase &params, cudaStream_t stream) {
#ifndef USE_ROCM
if (params.seqlen <= 128) {
selective_scan_fwd_launch<32, 4, input_t, weight_t>(params, stream);
} else if (params.seqlen <= 256) {
selective_scan_fwd_launch<32, 8, input_t, weight_t>(params, stream);
} else if (params.seqlen <= 512) {
selective_scan_fwd_launch<32, 16, input_t, weight_t>(params, stream);
} else if (params.seqlen <= 1024) {
selective_scan_fwd_launch<64, 16, input_t, weight_t>(params, stream);
} else {
selective_scan_fwd_launch<128, 16, input_t, weight_t>(params, stream);
}
#else
if (params.seqlen <= 256) {
selective_scan_fwd_launch<64, 4, input_t, weight_t>(params, stream);
} else if (params.seqlen <= 512) {
selective_scan_fwd_launch<64, 8, input_t, weight_t>(params, stream);
} else if (params.seqlen <= 1024) {
selective_scan_fwd_launch<64, 16, input_t, weight_t>(params, stream);
} else {
selective_scan_fwd_launch<128, 16, input_t, weight_t>(params, stream);
}
#endif
}
// Inspired by https://github.com/NVIDIA/DALI/blob/main/include/dali/core/static_switch.h
// and https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/Dispatch.h
#pragma once
/// @param COND - a boolean expression to switch by
/// @param CONST_NAME - the name given to the constexpr bool variable.
/// @param ... - code to execute for both the true and false branches
///
/// Usage:
/// ```
/// BOOL_SWITCH(flag, BoolConst, [&] {
/// some_function<BoolConst>(...);
/// });
/// ```
#define BOOL_SWITCH(COND, CONST_NAME, ...) \
[&] { \
if (COND) { \
constexpr bool CONST_NAME = true; \
return __VA_ARGS__(); \
} else { \
constexpr bool CONST_NAME = false; \
return __VA_ARGS__(); \
} \
}()
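// Illustrative sketch (mirrors selective_scan_fwd_launch in the kernel source above):
// BOOL_SWITCH calls can be nested so that each runtime flag becomes a compile-time
// constant inside the innermost lambda. `run_kernel` here is a hypothetical stand-in,
// not part of this codebase:
//
//   BOOL_SWITCH(params.is_variable_B, kIsVariableB, [&] {
//       BOOL_SWITCH(params.z_ptr != nullptr, kHasZ, [&] {
//           run_kernel<kIsVariableB, kHasZ>(params, stream);
//       });
//   });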
/******************************************************************************
* Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#pragma once
#ifndef USE_ROCM
#include <cub/config.cuh>
#include <cuda/std/type_traits>
#else
#include <hipcub/hipcub.hpp>
// Map ::cuda::std to the standard std namespace
namespace cuda {
namespace std = ::std;
}
#endif
namespace detail
{
#if defined(_NVHPC_CUDA)
template <typename T, typename U>
__host__ __device__ void uninitialized_copy(T *ptr, U &&val)
{
// NVBug 3384810
new (ptr) T(::cuda::std::forward<U>(val));
}
#else
template <typename T,
typename U,
typename ::cuda::std::enable_if<
::cuda::std::is_trivially_copyable<T>::value,
int
>::type = 0>
__host__ __device__ void uninitialized_copy(T *ptr, U &&val)
{
*ptr = ::cuda::std::forward<U>(val);
}
template <typename T,
typename U,
typename ::cuda::std::enable_if<
!::cuda::std::is_trivially_copyable<T>::value,
int
>::type = 0>
__host__ __device__ void uninitialized_copy(T *ptr, U &&val)
{
new (ptr) T(::cuda::std::forward<U>(val));
}
#endif
} // namespace detail
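// Hedged usage sketch: detail::uninitialized_copy constructs a value in raw,
// uninitialized storage; for trivially copyable types it reduces to a plain
// assignment, otherwise it uses placement new. For example (names below are
// illustrative only):
//
//   alignas(T) unsigned char storage[sizeof(T)];
//   detail::uninitialized_copy(reinterpret_cast<T *>(storage), value);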
import torch
import transformers
from transformers import AutoTokenizer
from mamba_ssm.models.mixer_seq_simple import MambaLMHeadModel
from lm_eval.api.model import LM
from lm_eval.models.huggingface import HFLM
from lm_eval.api.registry import register_model
from lm_eval.__main__ import cli_evaluate
@register_model("mamba")
class MambaEvalWrapper(HFLM):
AUTO_MODEL_CLASS = transformers.AutoModelForCausalLM
def __init__(self, pretrained="state-spaces/mamba-2.8b", max_length=2048, batch_size=None, device="cuda",
dtype=torch.float16):
LM.__init__(self)
self._model = MambaLMHeadModel.from_pretrained(pretrained, device=device, dtype=dtype)
self.tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")
self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
self.vocab_size = self.tokenizer.vocab_size
self._batch_size = int(batch_size) if batch_size is not None else 64
self._max_length = max_length
self._device = torch.device(device)
@property
def batch_size(self):
return self._batch_size
def _model_generate(self, context, max_length, stop, **generation_kwargs):
raise NotImplementedError()
if __name__ == "__main__":
cli_evaluate()
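# Hedged usage sketch (not part of this file): with lm-eval-harness installed, the
# registered "mamba" model is typically invoked through the harness CLI, for example
# something along the lines of:
#
#   python evals/lm_harness_eval.py --model mamba \
#       --model_args pretrained=state-spaces/mamba-2.8b \
#       --tasks lambada_openai,hellaswag --device cuda --batch_size 64
#
# Exact task names and flags depend on the installed lm_eval version.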