# Learn a lot from the MLC - LLM Project
# https://github.com/mlc-ai/mlc-llm/blob/main/CMakeLists.txt
cmake_minimum_required(VERSION 3.26)
project(TILE_LANG C CXX)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/.gitmodules" AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/.git")
find_package(Git QUIET)
if(Git_FOUND)
execute_process(
COMMAND ${GIT_EXECUTABLE} submodule update --init --recursive
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
RESULT_VARIABLE TILELANG_GIT_SUBMODULE_RESULT
)
if(NOT TILELANG_GIT_SUBMODULE_RESULT EQUAL 0)
message(
FATAL_ERROR
"Failed to initialize git submodules. Please run "
"`git submodule update --init --recursive` and re-run CMake."
)
endif()
else()
message(
FATAL_ERROR
"Git is required to initialize TileLang submodules. "
"Please install git or fetch the submodules manually."
)
endif()
endif()
find_program(CCACHE_PROGRAM ccache)
if(CCACHE_PROGRAM)
set(CMAKE_C_COMPILER_LAUNCHER "${CCACHE_PROGRAM}" CACHE STRING "C compiler launcher")
set(CMAKE_CXX_COMPILER_LAUNCHER "${CCACHE_PROGRAM}" CACHE STRING "CXX compiler launcher")
set(CMAKE_CUDA_COMPILER_LAUNCHER "${CCACHE_PROGRAM}" CACHE STRING "CUDA compiler launcher")
endif()
# Configs
set(USE_CUDA OFF)
set(USE_ROCM OFF)
set(USE_METAL OFF)
set(PREBUILD_CYTHON ON)
# Configs end
include(cmake/load_tvm.cmake)
if(EXISTS ${TVM_SOURCE}/cmake/config.cmake)
include(${TVM_SOURCE}/cmake/config.cmake)
else()
message(FATAL_ERROR "No TVM source provided and the TVM submodule is not checked out.")
endif()
# Include directories for TileLang
set(TILE_LANG_INCLUDES ${TVM_INCLUDES})
# Collect source files
file(GLOB TILE_LANG_SRCS
src/*.cc
src/layout/*.cc
src/transform/*.cc
src/op/*.cc
src/target/utils.cc
src/target/codegen_cpp.cc
src/target/rt_mod_cpp.cc
# webgpu doesn't have system dependency
src/target/codegen_webgpu.cc
# intrin_rule doesn't have system dependency
src/target/intrin_rule*.cc
)
# Backend-specific checks and configs
if($ENV{USE_METAL})
set(USE_METAL ON)
elseif(APPLE)
message(STATUS "Enable Metal support by default.")
set(USE_METAL ON)
elseif($ENV{USE_ROCM})
set(USE_ROCM ON)
else()
if($ENV{USE_CUDA})
set(USE_CUDA ON)
elseif(DEFINED ENV{USE_CUDA} AND NOT $ENV{USE_CUDA})
# Build CPU-only when we explicitly disable CUDA
set(USE_CUDA OFF)
else()
message(STATUS "Enable CUDA support by default.")
set(USE_CUDA ON)
endif()
endif()
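# Example (assumed invocation): the backend can be forced at configure time via the
# environment variables checked above, e.g.
#   USE_ROCM=ON cmake -S . -B build    # build the ROCm/HIP backend
#   USE_CUDA=OFF cmake -S . -B build   # CPU-only build
# On Apple hosts Metal is enabled by default; elsewhere CUDA is the default.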
if(USE_METAL)
file(GLOB TILE_LANG_METAL_SRCS
src/target/rt_mod_metal.cc
)
list(APPEND TILE_LANG_SRCS ${TILE_LANG_METAL_SRCS})
# FIXME: cibuildwheel (CIBW) builds fail when libbacktrace is enabled (cause unknown), so disable it here
set(TVM_FFI_USE_LIBBACKTRACE OFF)
elseif(USE_ROCM)
set(CMAKE_HIP_STANDARD 17)
include(${TVM_SOURCE}/cmake/utils/FindROCM.cmake)
find_rocm($ENV{USE_ROCM})
add_compile_definitions(__HIP_PLATFORM_AMD__ __HIP_PLATFORM_HCC__=1)
file(GLOB TILE_LANG_HIP_SRCS
src/target/codegen_hip.cc
src/target/rt_mod_hip.cc
)
list(APPEND TILE_LANG_SRCS ${TILE_LANG_HIP_SRCS})
list(APPEND TILE_LANG_INCLUDES ${ROCM_INCLUDE_DIRS})
elseif(USE_CUDA)
set(CMAKE_CUDA_STANDARD 17)
find_package(CUDAToolkit REQUIRED)
set(CMAKE_CUDA_COMPILER "${CUDAToolkit_BIN_DIR}/nvcc")
add_compile_definitions("CUDA_MAJOR_VERSION=${CUDAToolkit_VERSION_MAJOR}")
# Set `USE_CUDA=/usr/local/cuda-x.y`
cmake_path(GET CUDAToolkit_BIN_DIR PARENT_PATH USE_CUDA)
file(GLOB TILE_LANG_CUDA_SRCS
src/runtime/*.cc
src/target/ptx.cc
src/target/codegen_cuda.cc
src/target/rt_mod_cuda.cc
)
list(APPEND TILE_LANG_SRCS ${TILE_LANG_CUDA_SRCS})
list(APPEND TILE_LANG_INCLUDES ${CUDAToolkit_INCLUDE_DIRS})
endif()
# Include tvm after configs have been populated
add_subdirectory(${TVM_SOURCE} tvm EXCLUDE_FROM_ALL)
# Resolve compile warnings in tvm
add_compile_definitions(DMLC_USE_LOGGING_LIBRARY=<tvm/runtime/logging.h>)
add_library(tilelang_objs OBJECT ${TILE_LANG_SRCS})
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
target_compile_definitions(tilelang_objs PRIVATE "TVM_LOG_DEBUG")
endif()
target_include_directories(tilelang_objs PRIVATE ${TILE_LANG_INCLUDES})
add_library(tilelang SHARED $<TARGET_OBJECTS:tilelang_objs>)
add_library(tilelang_module SHARED $<TARGET_OBJECTS:tilelang_objs>)
target_link_libraries(tilelang PUBLIC tvm_runtime)
target_link_libraries(tilelang_module PUBLIC tvm)
if(APPLE)
# FIXME: libtilelang should only link against tvm runtime
target_link_libraries(tilelang PUBLIC tvm)
endif()
# Build cython extension
find_package(Python REQUIRED COMPONENTS Interpreter Development.Module ${SKBUILD_SABI_COMPONENT})
add_custom_command(
OUTPUT "${CMAKE_BINARY_DIR}/tilelang_cython_wrapper.cpp"
COMMENT
"Cythoning tilelang/jit/adapter/cython/cython_wrapper.pyx"
COMMAND Python::Interpreter -m cython
"${CMAKE_CURRENT_SOURCE_DIR}/tilelang/jit/adapter/cython/cython_wrapper.pyx"
--module-name tilelang_cython_wrapper
--cplus --output-file "${CMAKE_BINARY_DIR}/tilelang_cython_wrapper.cpp"
DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/tilelang/jit/adapter/cython/cython_wrapper.pyx"
VERBATIM)
if(NOT "${SKBUILD_SABI_VERSION}" STREQUAL "")
set(USE_SABI USE_SABI ${SKBUILD_SABI_VERSION})
endif()
python_add_library(tilelang_cython_wrapper MODULE "${CMAKE_BINARY_DIR}/tilelang_cython_wrapper.cpp" ${USE_SABI} WITH_SOABI)
# Install extension into the tilelang package directory
install(TARGETS tilelang_cython_wrapper
LIBRARY DESTINATION tilelang
RUNTIME DESTINATION tilelang
ARCHIVE DESTINATION tilelang)
# Let libtilelang find tvm/tvm_runtime in the same directory at load time
if(APPLE)
set_target_properties(tilelang PROPERTIES INSTALL_RPATH "@loader_path")
set_target_properties(tilelang_module PROPERTIES INSTALL_RPATH "@loader_path")
else()
set_target_properties(tilelang PROPERTIES INSTALL_RPATH "\$ORIGIN")
set_target_properties(tilelang_module PROPERTIES INSTALL_RPATH "\$ORIGIN")
endif()
install(TARGETS tvm tvm_runtime tilelang_module tilelang LIBRARY DESTINATION tilelang/lib)
# Copy tvm cython ext for wheels
# TODO: not necessary for editable builds
if(TVM_BUILD_FROM_SOURCE)
add_dependencies(tilelang tvm_cython)
install(FILES "${CMAKE_CURRENT_SOURCE_DIR}/3rdparty/tvm/python/tvm/ffi/core.abi3.so" DESTINATION tilelang/3rdparty/tvm/python/tvm/ffi/)
endif()
# Contributor Covenant Code of Conduct
## Our Pledge
We as members, contributors, and leaders pledge to make participation in our
community a harassment-free experience for everyone, regardless of age, body
size, visible or invisible disability, ethnicity, sex characteristics, gender
identity and expression, level of experience, education, socioeconomic status,
nationality, personal appearance, race, caste, color, religion, or sexual
identity and orientation.
We pledge to act and interact in ways that contribute to an open, welcoming,
diverse, inclusive, and healthy community.
## Our Standards
Examples of behavior that contributes to a positive environment for our
community include:
* Demonstrating empathy and kindness toward other people
* Being respectful of differing opinions, viewpoints, and experiences
* Giving and gracefully accepting constructive feedback
* Accepting responsibility and apologizing to those affected by our mistakes,
and learning from the experience
* Focusing on what is best not just for us as individuals, but for the overall
community
Examples of unacceptable behavior include:
* The use of sexualized language or imagery, and sexual attention or advances of
any kind
* Trolling, insulting or derogatory comments, and personal or political attacks
* Public or private harassment
* Publishing others' private information, such as a physical or email address,
without their explicit permission
* Other conduct which could reasonably be considered inappropriate in a
professional setting
## Enforcement Responsibilities
Community leaders are responsible for clarifying and enforcing our standards of
acceptable behavior and will take appropriate and fair corrective action in
response to any behavior that they deem inappropriate, threatening, offensive,
or harmful.
Community leaders have the right and responsibility to remove, edit, or reject
comments, commits, code, wiki edits, issues, and other contributions that are
not aligned to this Code of Conduct, and will communicate reasons for moderation
decisions when appropriate.
## Scope
This Code of Conduct applies within all community spaces, and also applies when
an individual is officially representing the community in public spaces.
Examples of representing our community include using an official e-mail address,
posting via an official social media account, or acting as an appointed
representative at an online or offline event.
## Enforcement
Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported to the community leaders responsible for enforcement at
[leiwang1999@outlook.com](mailto:leiwang1999@outlook.com).
All complaints will be reviewed and investigated promptly and fairly.
All community leaders are obligated to respect the privacy and security of the
reporter of any incident.
## Enforcement Guidelines
Community leaders will follow these Community Impact Guidelines in determining
the consequences for any action they deem in violation of this Code of Conduct:
### 1. Correction
**Community Impact**: Use of inappropriate language or other behavior deemed
unprofessional or unwelcome in the community.
**Consequence**: A private, written warning from community leaders, providing
clarity around the nature of the violation and an explanation of why the
behavior was inappropriate. A public apology may be requested.
### 2. Warning
**Community Impact**: A violation through a single incident or series of
actions.
**Consequence**: A warning with consequences for continued behavior. No
interaction with the people involved, including unsolicited interaction with
those enforcing the Code of Conduct, for a specified period of time. This
includes avoiding interactions in community spaces as well as external channels
like social media. Violating these terms may lead to a temporary or permanent
ban.
### 3. Temporary Ban
**Community Impact**: A serious violation of community standards, including
sustained inappropriate behavior.
**Consequence**: A temporary ban from any sort of interaction or public
communication with the community for a specified period of time. No public or
private interaction with the people involved, including unsolicited interaction
with those enforcing the Code of Conduct, is allowed during this period.
Violating these terms may lead to a permanent ban.
### 4. Permanent Ban
**Community Impact**: Demonstrating a pattern of violation of community
standards, including sustained inappropriate behavior, harassment of an
individual, or aggression toward or disparagement of classes of individuals.
**Consequence**: A permanent ban from any sort of public interaction within the
community.
## Attribution
This Code of Conduct is adapted from the [Contributor Covenant][homepage],
version 2.1, available at
[https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1].
Community Impact Guidelines were inspired by
[Mozilla's code of conduct enforcement ladder][Mozilla CoC].
For answers to common questions about this code of conduct, see the FAQ at
[https://www.contributor-covenant.org/faq][FAQ]. Translations are available at
[https://www.contributor-covenant.org/translations][translations].
[homepage]: https://www.contributor-covenant.org
[v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html
[Mozilla CoC]: https://github.com/mozilla/diversity
[FAQ]: https://www.contributor-covenant.org/faq
[translations]: https://www.contributor-covenant.org/translations
# Contributing
It would be awesome if you want to contribute something to TileLang!
### Table of Contents <!-- omit in toc --> <!-- markdownlint-disable heading-increment -->
- [Report Bugs](#report-bugs)
- [Ask Questions](#ask-questions)
- [Submit Pull Requests](#submit-pull-requests)
- [Setup Development Environment](#setup-development-environment)
- [Install Develop Version](#install-develop-version)
- [Lint Check](#lint-check)
- [Test Locally](#test-locally)
- [Build Wheels](#build-wheels)
- [Documentation](#documentation)
## Report Bugs
If you run into any weird behavior while using TileLang, feel free to open a new issue in this repository! Please run a **search before opening** a new issue, to make sure that someone else hasn't already reported or solved the bug you've found.
Any issue you open must include:
- A code snippet that reproduces the bug with a minimal setup (a template sketch follows below).
- A clear explanation of what the issue is.
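If it helps, a report can follow a minimal sketch like the one below; the kernel body, shapes, and the final check are placeholders to be replaced with whatever actually triggers the bug.

```python
# Hypothetical reproduction template -- replace the kernel body, shapes, and the
# final check with the smallest case that still shows the bug you observed.
# Please also mention your GPU model, CUDA/ROCm version, and how you installed tilelang.
import torch
import tilelang
import tilelang.language as T


@tilelang.jit
def repro(M=256, N=256, block=128, dtype="float16"):

    @T.prim_func
    def kernel(A: T.Tensor((M, N), dtype), B: T.Tensor((M, N), dtype)):
        with T.Kernel(T.ceildiv(N, block), T.ceildiv(M, block), threads=128) as (bx, by):
            for i, j in T.Parallel(block, block):
                # Placeholder body: tile-wise elementwise ReLU
                B[by * block + i, bx * block + j] = T.max(A[by * block + i, bx * block + j], 0)

    return kernel


a = torch.randn(256, 256, device="cuda", dtype=torch.float16)
b = torch.empty_like(a)
repro()(a, b)
# Describe what you expected and what actually happened, e.g.:
torch.testing.assert_close(b, torch.relu(a), rtol=1e-2, atol=1e-2)
```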
## Ask Questions
Please ask questions in issues.
## Submit Pull Requests
All pull requests are very welcome and greatly appreciated! Issues in need of a solution are marked with a [`♥ help`](https://github.com/ianstormtaylor/TileLang/issues?q=is%3Aissue+is%3Aopen+label%3A%22%E2%99%A5+help%22) label if you're looking for somewhere to start.
If you're new to contributing to TileLang, please follow the guidelines below before submitting a pull request.
> [!NOTE]
> Please include tests and docs with every pull request if applicable!
## Setup Development Environment
Before contributing to TileLang, please follow the instructions below to set up your development environment.
1. Fork TileLang ([fork](https://github.com/tile-ai/tilelang/fork)) on GitHub and clone the repository.
```bash
git clone --recurse-submodules git@github.com:<your username>/tilelang.git # use the SSH protocol
cd tilelang
git remote add upstream git@github.com:tile-ai/tilelang.git
```
2. Setup a development environment:
```bash
uv venv --seed .venv # use `python3 -m venv .venv` if you don't have `uv`
source .venv/bin/activate
python3 -m pip install --upgrade pip setuptools wheel "build[uv]"
uv pip install --requirements requirements-dev.txt
```
3. Setup the [`pre-commit`](https://pre-commit.com) hooks:
```bash
pre-commit install --install-hooks
```
Then you are ready to rock. Thanks for contributing to TileLang!
## Install Develop Version
To install TileLang in an "editable" mode, run:
```bash
python3 -m pip install --no-build-isolation --verbose --editable .
```
in the main directory. The installation can be removed with:
```bash
python3 -m pip uninstall tilelang
```
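To confirm that the editable install resolves to your checkout, a quick sanity check:

```python
# Should print a path inside your cloned tilelang/ working tree.
import tilelang
print(tilelang.__file__)
```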
## Lint Check
To check the linting, run:
```bash
pre-commit run --all-files
```
## Test Locally
To run the tests, start by building the project as described in the [Setup Development Environment](#setup-development-environment) section.
Then you can rerun the tests with:
```bash
python3 -m pytest testing
```
## Build Wheels
_TBA_
## Documentation
_TBA_
MIT License
Copyright (c) Tile-AI.
**During the period from December 1, 2024, to March 14, 2025, this project is
subject to additional collaboration terms with Microsoft Corporation.**
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
# Reference: https://setuptools.pypa.io/en/latest/userguide/miscellaneous.html
# Include licenses
include VERSION
include LICENSE
include THIRDPARTYNOTICES.txt
# Version and dependency files
include version_provider.py
include requirements*.txt
include tilelang/jit/adapter/cython/cython_wrapper.pyx
# Include source files in SDist
include CMakeLists.txt
graft src
graft cmake
graft 3rdparty
# Include test suites in SDist
graft testing
graft examples
global-exclude .coverage .coverage.* coverage.xml coverage-*.xml coverage.*.xml
global-exclude .junit .junit.* junit.xml junit-*.xml junit.*.xml
# Exclude unneeded files and directories
prune .git
prune .github
prune */.git
prune */.github
prune 3rdparty/clang*
prune 3rdparty/llvm*
# Prune compiled files
prune */__pycache__
global-exclude *~ *.py[cod] *.so *.a *.dylib *.pxd *.dll *.lib *.o *.obj
<img src=./images/logo-row.svg />
<div align="center">
# Tile Language
[![PyPI version](https://badge.fury.io/py/tilelang.svg)](https://badge.fury.io/py/tilelang)
[![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/tile-ai/tilelang) [![Discord](https://img.shields.io/badge/Discord-%235865F2.svg?logo=discord&logoColor=white)](https://discord.gg/TUrHyJnKPG)
</div>
Tile Language (**tile-lang**) is a concise domain-specific language designed to streamline the development of high-performance GPU/CPU kernels (e.g., GEMM, Dequant GEMM, FlashAttention, LinearAttention). By employing a Pythonic syntax with an underlying compiler infrastructure on top of [TVM](https://tvm.apache.org/), tile-lang allows developers to focus on productivity without sacrificing the low-level optimizations necessary for state-of-the-art performance.
<img src=./images/MatmulExample.png />
## Latest News
- 10/07/2025 🍎: Added Apple Metal Device support, check out [Pull Request #799](https://github.com/tile-ai/tilelang/pull/799) for details.
- 09/29/2025 🎉: Thrilled to announce that AscendC and AscendNPU IR backends targeting Huawei Ascend chips are now supported! Check out the preview here: 🔗 [link](https://github.com/tile-ai/tilelang-ascend). This includes implementations across two branches: [ascendc_pto](https://github.com/tile-ai/tilelang-ascend) and [npuir](https://github.com/tile-ai/tilelang-ascend/tree/npuir). Feel free to explore and share your feedback!
- 07/04/2025 🚀: Introduced `T.gemm_sp` for 2:4 sparse tensor core support, check out [Pull Request #526](https://github.com/tile-ai/tilelang/pull/526) for details.
- 06/05/2025 ✨: Added [NVRTC Backend](https://github.com/tile-ai/tilelang/pull/461) to significantly reduce compilation time for cute templates!
- 04/14/2025 🚀: Added high-performance FlashMLA implementation for AMD MI300X, achieving performance parity with hand-optimized assembly kernels of Aiter! See [example_mla_amd](./examples/deepseek_mla/amd/README.md) for details.
- 03/03/2025 🚀: Added high-performance MLA Decoding support using only 80 lines of Python code, achieving performance on par with FlashMLA on H100 (see [example_mla_decode.py](./examples/deepseek_mla/example_mla_decode.py))! We also provide [documentation](./examples/deepseek_mla/README.md) explaining how TileLang achieves this.
- 02/15/2025 ✨: Added WebGPU Codegen support, see [Pull Request #86](https://github.com/tile-ai/tilelang/pull/86)!
- 02/12/2025 ✨: Excited to announce the release of [v0.1.0](https://github.com/tile-ai/tilelang/releases/tag/v0.1.0)!
- 02/10/2025 🚀: Added debug tools for TileLang—`T.print` for printing variables/buffers ([docs](https://tilelang.com/tutorials/debug_tools_for_tilelang.html)) and a memory layout plotter ([examples/plot_layout](./examples/plot_layout)).
- 01/20/2025 ✨: We are excited to announce that tile-lang, a DSL for high-performance AI workloads, is now open source and available to the public!
## Tested Devices
Although tile-lang aims to be portable across a range of devices, it has been specifically tested and validated on the following: for NVIDIA GPUs, the H100 (with auto TMA/WGMMA support), A100, V100, RTX 4090, RTX 3090, and RTX A6000; for AMD GPUs, the MI250 (with auto MatrixCore support) and the MI300X (with async copy support).
## OP Implementation Examples
**tile-lang** provides the building blocks to implement a wide variety of operators. Some examples include:
- [Matrix Multiplication](./examples/gemm/)
- [Dequantization GEMM](./examples/dequantize_gemm/)
- [Flash Attention](./examples/flash_attention/)
- [Flash Linear Attention](./examples/linear_attention/)
- [Flash MLA Decoding](./examples/deepseek_mla/)
- [Native Sparse Attention](./examples/deepseek_nsa/)
Within the `examples` directory, you will also find additional complex kernels, such as convolutions and forward/backward passes for FlashAttention; more operators will be added continuously.
## Benchmark Summary
TileLang achieves exceptional performance across a variety of computational patterns. Comprehensive benchmark scripts and settings are available at [tilelang-benchmark](https://github.com/tile-ai/tilelang-benchmark). Below are selected results showcasing its capabilities:
- MLA Decoding Performance on H100
<div style="display: flex; gap: 10px; justify-content: center;">
<div style="flex: 1;">
<img src="./examples/deepseek_mla/figures/bs64_float16.png" alt="mla decode performance bs64 on H100" width="100%" />
</div>
<div style="flex: 1;">
<img src="./examples/deepseek_mla/figures/bs128_float16.png" alt="mla decode performance bs128 on H100" width="100%" />
</div>
</div>
- Flash Attention Performance on H100
<div align="center"> <img src="./images/mha_performance_h100.png" alt="operator performance on H100" width=80% />
</div>
- Matmul Performance on GPUs (RTX 4090, A100, H100, MI300X)
<div>
<img src="./images/op_benchmark_consistent_gemm_fp16.png" alt="gemm fp16 performance on Gpus" />
</div>
- Dequantize Matmul Performance on A100
<div>
<img src="./images/op_benchmark_a100_wq_gemv.png" alt="dequantize gemv performance on A100" />
</div>
## Installation
### Method 1: Install with Pip
The quickest way to get started is to install the latest release from PyPI:
```bash
pip install tilelang
```
Alternatively, you can install directly from the GitHub repository:
```bash
pip install git+https://github.com/tile-ai/tilelang
```
Or install locally:
```bash
# install required system dependencies
sudo apt-get update
sudo apt-get install -y python3-setuptools gcc libtinfo-dev zlib1g-dev build-essential cmake libedit-dev libxml2-dev
pip install -e . -v # remove -e option if you don't want to install in editable mode, -v for verbose output
```
### Method 2: Build from Source
We currently provide three ways to install **tile-lang** from source:
- [Install from Source (using your own TVM installation)](./docs/get_started/Installation.md#method-1-install-from-source-using-your-own-tvm-installation)
- [Install from Source (using the bundled TVM submodule)](./docs/get_started/Installation.md#method-2-install-from-source-using-the-bundled-tvm-submodule)
- [Install Using the Provided Script](./docs/get_started/Installation.md#method-3-install-using-the-provided-script)
### Method 3: Install with Nightly Version
For users who want access to the latest features and improvements before official releases, we provide nightly builds of **tile-lang**.
```bash
pip install tilelang -f https://tile-ai.github.io/whl/nightly/cu121/
# or pip install tilelang --find-links https://tile-ai.github.io/whl/nightly/cu121/
```
> **Note:** Nightly builds contain the most recent code changes but may be less stable than official releases. They're ideal for testing new features or if you need a specific bugfix that hasn't been released yet.
## Quick Start
In this section, you'll learn how to write and execute a straightforward GEMM (matrix multiplication) kernel using tile-lang, followed by techniques for layout optimizations, pipelining, and L2-cache–friendly swizzling.
### GEMM Example with Annotations (Layout, L2 Cache Swizzling, and Pipelining, etc.)
Below is an example that demonstrates more advanced features: layout annotation, parallelized copy, and swizzle for improved L2 cache locality. This snippet shows how to adapt your kernel to maximize performance on complex hardware.
```python
import tilelang
import tilelang.language as T
# @tilelang.jit(target="cuda")
# target currently can be "cuda" or "hip" or "cpu".
# if not specified, it will be inferred from the input tensors during compile time
@tilelang.jit
def matmul(M, N, K, block_M, block_N, block_K, dtype="float16", accum_dtype="float"):
@T.prim_func
def matmul_relu_kernel(
A: T.Tensor((M, K), dtype),
B: T.Tensor((K, N), dtype),
C: T.Tensor((M, N), dtype),
):
# Initialize Kernel Context
with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=128) as (bx, by):
A_shared = T.alloc_shared((block_M, block_K), dtype)
B_shared = T.alloc_shared((block_K, block_N), dtype)
C_local = T.alloc_fragment((block_M, block_N), accum_dtype)
# Enable rasterization for better L2 cache locality (Optional)
# T.use_swizzle(panel_size=10, enable=True)
# Clear local accumulation
T.clear(C_local)
for ko in T.Pipelined(T.ceildiv(K, block_K), num_stages=3):
# Copy tile of A
# This is syntactic sugar for a parallelized copy
T.copy(A[by * block_M, ko * block_K], A_shared)
# Copy tile of B
T.copy(B[ko * block_K, bx * block_N], B_shared)
# Perform a tile-level GEMM on the shared buffers
# Currently this dispatches to cute/hip implementations on NVIDIA/AMD GPUs
T.gemm(A_shared, B_shared, C_local)
# relu
for i, j in T.Parallel(block_M, block_N):
C_local[i, j] = T.max(C_local[i, j], 0)
# Copy result back to global memory
T.copy(C_local, C[by * block_M, bx * block_N])
return matmul_relu_kernel
M = 1024 # M = T.dynamic("m") if you want to use dynamic shape
N = 1024
K = 1024
block_M = 128
block_N = 128
block_K = 32
# 1. Define the kernel (matmul) and compile/lower it into an executable module
matmul_relu_kernel = matmul(M, N, K, block_M, block_N, block_K)
# 2. Test the kernel in Python with PyTorch data
import torch
# Create random input tensors on the GPU
a = torch.randn(M, K, device="cuda", dtype=torch.float16)
b = torch.randn(K, N, device="cuda", dtype=torch.float16)
c = torch.empty(M, N, device="cuda", dtype=torch.float16)
# Run the kernel through the Profiler
matmul_relu_kernel(a, b, c)
print(c)
# Reference multiplication using PyTorch
ref_c = torch.relu(a @ b)
# Validate correctness
torch.testing.assert_close(c, ref_c, rtol=1e-2, atol=1e-2)
print("Kernel output matches PyTorch reference.")
# 3. Retrieve and inspect the generated CUDA source (optional)
# cuda_source = matmul_relu_kernel.get_kernel_source()
# print("Generated CUDA kernel:\n", cuda_source)
# 4. Profile the kernel latency with the built-in profiler
profiler = matmul_relu_kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Normal)
latency = profiler.do_bench()
print(f"Latency: {latency} ms")
```
### Dive Deep into TileLang Beyond GEMM
In addition to GEMM, we provide a variety of examples to showcase the versatility and power of TileLang, including:
- [Dequantize GEMM](./examples/dequantize_gemm/): Achieve high-performance dequantization through **fine-grained control over per-thread operations**; many of these techniques are now default behaviors in [BitBLAS](https://github.com/microsoft/BitBLAS), which uses layout transformations and intrinsics to accelerate dequantized GEMM.
- [FlashAttention](./examples/flash_attention/): Enable cross-operator fusion with simple and intuitive syntax; an auto-tuning example is also provided (see the sketch after this list).
- [LinearAttention](./examples/linear_attention/): Examples include RetNet and Mamba implementations.
- [Convolution](./examples/convolution/): Convolution implementations using im2col.
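For a taste of the auto-tuning workflow mentioned above, here is a minimal sketch that stacks `@autotune` on top of `@tilelang.jit` in the same way as the benchmark scripts bundled in this repository (assuming `autotune` is importable from `tilelang.autotuner`, as those scripts do); the search space and tile sizes are illustrative, not the tuned configurations shipped with the examples.

```python
# Minimal auto-tuning sketch (illustrative search space). @autotune sweeps the
# configs, @tilelang.jit compiles each candidate, and the returned kernel
# carries the best measured latency and config.
import itertools

import tilelang
import tilelang.language as T
from tilelang.autotuner import autotune


def get_configs():
    params = dict(block_M=[64, 128], block_N=[64, 128], block_K=[32, 64], num_stages=[2, 3])
    return [dict(zip(params, values)) for values in itertools.product(*params.values())]


@autotune(configs=get_configs(), warmup=10, rep=10)
@tilelang.jit(out_idx=[2])
def matmul(M, N, K, block_M=128, block_N=128, block_K=32, num_stages=2,
           dtype="float16", accum_dtype="float"):

    @T.prim_func
    def kernel(
            A: T.Tensor((M, K), dtype),
            B: T.Tensor((K, N), dtype),
            C: T.Tensor((M, N), dtype),
    ):
        with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=128) as (bx, by):
            A_shared = T.alloc_shared((block_M, block_K), dtype)
            B_shared = T.alloc_shared((block_K, block_N), dtype)
            C_local = T.alloc_fragment((block_M, block_N), accum_dtype)
            T.clear(C_local)
            for ko in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages):
                T.copy(A[by * block_M, ko * block_K], A_shared)
                T.copy(B[ko * block_K, bx * block_N], B_shared)
                T.gemm(A_shared, B_shared, C_local)
            T.copy(C_local, C[by * block_M, bx * block_N])

    return kernel


best = matmul(1024, 1024, 1024)   # runs the sweep and returns the best kernel
print(best.latency, best.config)  # attributes used by the benchmark scripts in this repo
```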
## Upcoming Features
Check our [tilelang v0.2.0 release plan](https://github.com/tile-ai/tilelang/issues/79) for upcoming features.
---
TileLang is now used in the [BitBLAS](https://github.com/microsoft/BitBLAS) and [AttentionEngine](https://github.com/microsoft/AttentionEngine) projects.
## Join the Discussion
Join our Discord community for discussions, support, and collaboration!
[![Join our Discord](https://img.shields.io/badge/Discord-Join%20Us-blue?logo=discord&style=for-the-badge)](https://discord.gg/TUrHyJnKPG)
## Acknowledgments
We would like to express our gratitude to the [TVM](https://github.com/apache/tvm) community for their invaluable contributions. The initial version of this project was mainly developed by [LeiWang1999](https://github.com/LeiWang1999), [chengyupku](https://github.com/chengyupku) and [nox-410](https://github.com/nox-410) with supervision from Prof. [Zhi Yang](https://yangzhihome.github.io) at Peking University. Part of this work was carried out during an internship at Microsoft Research, where Dr. Lingxiao Ma, Dr. Yuqing Xia, Dr. Jilong Xue, and Dr. Fan Yang offered valuable advice and support. We deeply appreciate their mentorship and contributions.
0.1.6.post1
# BATCH, N_HEADS, SEQ_LEN, D_HEAD, TOPK, BLOCK
configs = [[4, 2, 256, 64, 2, 64]]
# ruff: noqa
import torch
from tilelang.profiler import do_bench
def get_sparse_attn_mask_from_topk(x, topk, use_dense_for_last_block=False):
bsz, num_head, downsample_len, _ = x.shape
# N_CTX = downsample_len * BLOCK
sparse_index = torch.topk(x, topk, dim=-1).indices
dense_mask = torch.full([bsz, num_head, downsample_len, downsample_len],
False,
dtype=torch.bool,
device=x.device)
dense_mask.scatter_(-1, sparse_index, True)
if use_dense_for_last_block:
dense_mask[:, :, -2:, :] = True
dense_mask.tril_()
return dense_mask
def get_sparse_attn_mask_from_threshold(x, threshold, use_dense_for_last_block=False):
dense_mask = x > threshold
if use_dense_for_last_block:
dense_mask[:, :, -2:, :] = True
dense_mask.tril_()
return dense_mask
def benchmark_topk_sparse_attention():
from benchmark_configs import configs
torch.manual_seed(0)
# Config
for BATCH, N_HEADS, SEQ_LEN, D_HEAD, TOPK, BLOCK in configs:
# Create inputs
q = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.float16)
k = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.float16)
v = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.float16)
import flash_attn
def benchmark_fn():
flash_attn.flash_attn_func(q, k, v, causal=True)
ref_latency = do_bench(
benchmark_fn,
warmup=10,
rep=100,
)
print(
f"BATCH: {BATCH}, N_HEADS: {N_HEADS}, SEQ_LEN: {SEQ_LEN}, D_HEAD: {D_HEAD}, TOPK: {TOPK}, BLOCK: {BLOCK}, ref_latency: {ref_latency}"
)
if __name__ == "__main__":
benchmark_topk_sparse_attention()
# ruff: noqa
import math
import torch
import tilelang
from tilelang import language as T
from tilelang.profiler import do_bench
def is_hip():
return False
def get_sparse_attn_mask_from_topk(x, topk, use_dense_for_last_block=False):
bsz, num_head, downsample_len, _ = x.shape
# N_CTX = downsample_len * BLOCK
sparse_index = torch.topk(x, topk, dim=-1).indices
dense_mask = torch.full([bsz, num_head, downsample_len, downsample_len],
False,
dtype=torch.bool,
device=x.device)
dense_mask.scatter_(-1, sparse_index, True)
if use_dense_for_last_block:
dense_mask[:, :, -2:, :] = True
dense_mask.tril_()
return dense_mask
def get_sparse_attn_mask_from_threshold(x, threshold, use_dense_for_last_block=False):
dense_mask = x > threshold
if use_dense_for_last_block:
dense_mask[:, :, -2:, :] = True
dense_mask.tril_()
return dense_mask
def blocksparse_flashattn(batch, heads, seq_len, dim, downsample_len, is_causal):
block_M = 64
block_N = 64
num_stages = 2
threads = 128
scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e)
shape = [batch, heads, seq_len, dim]
block_mask_shape = [batch, heads, downsample_len, downsample_len]
dtype = "float16"
accum_dtype = "float"
block_mask_dtype = "bool"
def kernel_func(block_M, block_N, num_stages, threads):
@T.macro
def MMA0(
K: T.Tensor(shape, dtype),
Q_shared: T.SharedBuffer([block_M, dim], dtype),
K_shared: T.SharedBuffer([block_N, dim], dtype),
acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype),
k: T.int32,
bx: T.int32,
by: T.int32,
bz: T.int32,
):
T.copy(K[bz, by, k * block_N:(k + 1) * block_N, :], K_shared)
if is_causal:
for i, j in T.Parallel(block_M, block_N):
acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0,
-T.infinity(acc_s.dtype))
else:
T.clear(acc_s)
T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow)
@T.macro
def MMA1(
V: T.Tensor(shape, dtype),
V_shared: T.SharedBuffer([block_M, dim], dtype),
acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype),
acc_o: T.FragmentBuffer([block_M, dim], accum_dtype),
k: T.int32,
by: T.int32,
bz: T.int32,
):
T.copy(V[bz, by, k * block_N:(k + 1) * block_N, :], V_shared)
T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow)
@T.macro
def Softmax(
acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype),
acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype),
scores_max: T.FragmentBuffer([block_M], accum_dtype),
scores_max_prev: T.FragmentBuffer([block_M], accum_dtype),
scores_scale: T.FragmentBuffer([block_M], accum_dtype),
scores_sum: T.FragmentBuffer([block_M], accum_dtype),
logsum: T.FragmentBuffer([block_M], accum_dtype),
):
T.copy(scores_max, scores_max_prev)
T.fill(scores_max, -T.infinity(accum_dtype))
T.reduce_max(acc_s, scores_max, dim=1, clear=False)
# To do causal softmax, we need to set the scores_max to 0 if it is -inf
# This process is called Check_inf in FlashAttention3 code, and it only need to be done
# in the first ceil_div(kBlockM, kBlockN) steps.
# for i in T.Parallel(block_M):
# scores_max[i] = T.if_then_else(scores_max[i] == -T.infinity(accum_dtype), 0, scores_max[i])
for i in T.Parallel(block_M):
scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale)
for i, j in T.Parallel(block_M, block_N):
# Instead of computing exp(x - max), we compute exp2(x * log_2(e) -
# max * log_2(e)) This allows the compiler to use the ffma
# instruction instead of fadd and fmul separately.
acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale)
T.reduce_sum(acc_s, scores_sum, dim=1)
for i in T.Parallel(block_M):
logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i]
T.copy(acc_s, acc_s_cast)
@T.macro
def Rescale(
acc_o: T.FragmentBuffer([block_M, dim], accum_dtype),
scores_scale: T.FragmentBuffer([block_M], accum_dtype),
):
for i, j in T.Parallel(block_M, dim):
acc_o[i, j] *= scores_scale[i]
@T.prim_func
def main(
Q: T.Tensor(shape, dtype),
K: T.Tensor(shape, dtype),
V: T.Tensor(shape, dtype),
BlockSparseMask: T.Tensor(block_mask_shape, block_mask_dtype),
Output: T.Tensor(shape, dtype),
):
with T.Kernel(
T.ceildiv(seq_len, block_M), heads, batch, threads=threads) as (bx, by, bz):
Q_shared = T.alloc_shared([block_M, dim], dtype)
K_shared = T.alloc_shared([block_N, dim], dtype)
V_shared = T.alloc_shared([block_N, dim], dtype)
O_shared = T.alloc_shared([block_M, dim], dtype)
acc_s = T.alloc_fragment([block_M, block_N], accum_dtype)
acc_s_cast = T.alloc_fragment([block_M, block_N], dtype)
acc_o = T.alloc_fragment([block_M, dim], accum_dtype)
scores_max = T.alloc_fragment([block_M], accum_dtype)
scores_max_prev = T.alloc_fragment([block_M], accum_dtype)
scores_scale = T.alloc_fragment([block_M], accum_dtype)
scores_sum = T.alloc_fragment([block_M], accum_dtype)
logsum = T.alloc_fragment([block_M], accum_dtype)
block_mask = T.alloc_local([downsample_len], block_mask_dtype)
T.copy(Q[bz, by, bx * block_M:(bx + 1) * block_M, :], Q_shared)
T.fill(acc_o, 0)
T.fill(logsum, 0)
T.fill(scores_max, -T.infinity(accum_dtype))
for vj in T.serial(downsample_len):
block_mask[vj] = BlockSparseMask[bz, by, bx, vj]
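# The cached mask row lets the pipelined loop below skip K/V tiles whose block is masked out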
loop_range = (
T.min(T.ceildiv(seq_len, block_N), T.ceildiv(
(bx + 1) * block_M, block_N)) if is_causal else T.ceildiv(seq_len, block_N))
for k in T.Pipelined(loop_range, num_stages=num_stages):
if block_mask[k]:
MMA0(K, Q_shared, K_shared, acc_s, k, bx, by, bz)
Softmax(acc_s, acc_s_cast, scores_max, scores_max_prev, scores_scale,
scores_sum, logsum)
Rescale(acc_o, scores_scale)
MMA1(V, V_shared, acc_s_cast, acc_o, k, by, bz)
for i, j in T.Parallel(block_M, dim):
acc_o[i, j] /= logsum[i]
T.copy(acc_o, O_shared)
T.copy(O_shared, Output[bz, by, bx * block_M:(bx + 1) * block_M, :])
return main
return kernel_func(block_M, block_N, num_stages, threads)
def benchmark_topk_sparse_attention():
from benchmark_configs import configs
torch.manual_seed(0)
# Config
for BATCH, N_HEADS, SEQ_LEN, D_HEAD, TOPK, BLOCK in configs:
# Create inputs
q = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.float16)
k = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.float16)
v = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.float16)
# Create sparse mask (downsampled to block level)
downsample_factor = BLOCK
downsample_len = math.ceil(SEQ_LEN / downsample_factor)
x_ds = torch.randn([BATCH, N_HEADS, downsample_len, downsample_len],
device='cuda',
dtype=torch.bfloat16)
x_ds[:, :, :, 0] = 100
block_mask = get_sparse_attn_mask_from_topk(x_ds, topk=TOPK)
program = blocksparse_flashattn(
BATCH, N_HEADS, SEQ_LEN, D_HEAD, downsample_len, is_causal=True)
kernel = tilelang.compile(program, out_idx=4)
def benchmark_fn():
# Run the compiled TileLang block-sparse attention kernel
kernel(q, k, v, block_mask)
ref_latency = do_bench(
benchmark_fn,
warmup=10,
rep=100,
)
print(
f"BATCH: {BATCH}, N_HEADS: {N_HEADS}, SEQ_LEN: {SEQ_LEN}, D_HEAD: {D_HEAD}, TOPK: {TOPK}, BLOCK: {BLOCK}, ref_latency: {ref_latency}"
)
if __name__ == "__main__":
benchmark_topk_sparse_attention()
# ruff: noqa
import math
import torch
import torch.nn.functional as F
from tilelang.profiler import do_bench
def get_sparse_attn_mask_from_topk(x, topk, use_dense_for_last_block=False):
bsz, num_head, downsample_len, _ = x.shape
# N_CTX = downsample_len * BLOCK
sparse_index = torch.topk(x, topk, dim=-1).indices
dense_mask = torch.full([bsz, num_head, downsample_len, downsample_len],
False,
dtype=torch.bool,
device=x.device)
dense_mask.scatter_(-1, sparse_index, True)
if use_dense_for_last_block:
dense_mask[:, :, -2:, :] = True
dense_mask.tril_()
return dense_mask
def get_sparse_attn_mask_from_threshold(x, threshold, use_dense_for_last_block=False):
dense_mask = x > threshold
if use_dense_for_last_block:
dense_mask[:, :, -2:, :] = True
dense_mask.tril_()
return dense_mask
def benchmark_topk_sparse_attention():
from benchmark_configs import configs
torch.manual_seed(0)
# Config
for BATCH, N_HEADS, SEQ_LEN, D_HEAD, TOPK, BLOCK in configs:
# Create inputs
q = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.float16)
k = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.float16)
v = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.float16)
sm_scale = 1.0 / (D_HEAD**0.5)
# Create sparse mask (downsampled to block level)
downsample_factor = BLOCK
downsample_len = math.ceil(SEQ_LEN / downsample_factor)
x_ds = torch.randn([BATCH, N_HEADS, downsample_len, downsample_len],
device='cuda',
dtype=torch.bfloat16)
x_ds[:, :, :, 0] = 100
block_mask = get_sparse_attn_mask_from_topk(x_ds, topk=TOPK)
def benchmark_fn():
# Compute reference
# Expand block mask to full attention matrix
full_mask = torch.kron(block_mask.float(), torch.ones(BLOCK, BLOCK, device='cuda'))
full_mask = full_mask[..., :SEQ_LEN, :SEQ_LEN].bool()
full_mask = full_mask & torch.tril(torch.ones_like(full_mask)) # Apply causal
# PyTorch reference implementation
attn = torch.einsum('bhsd,bhtd->bhst', q, k) * sm_scale
attn = attn.masked_fill(~full_mask, float('-inf'))
attn = F.softmax(attn, dim=-1)
ref_output = torch.einsum('bhst,bhtd->bhsd', attn, v)
return ref_output
ref_latency = do_bench(
benchmark_fn,
warmup=10,
rep=100,
)
print(
f"BATCH: {BATCH}, N_HEADS: {N_HEADS}, SEQ_LEN: {SEQ_LEN}, D_HEAD: {D_HEAD}, TOPK: {TOPK}, BLOCK: {BLOCK}, ref_latency: {ref_latency}"
)
if __name__ == "__main__":
benchmark_topk_sparse_attention()
# ruff: noqa
import math
import torch
import triton
import triton.language as tl
from tilelang.profiler import do_bench
def is_hip():
return False
def get_sparse_attn_mask_from_topk(x, topk, use_dense_for_last_block=False):
bsz, num_head, downsample_len, _ = x.shape
# N_CTX = downsample_len * BLOCK
sparse_index = torch.topk(x, topk, dim=-1).indices
dense_mask = torch.full([bsz, num_head, downsample_len, downsample_len],
False,
dtype=torch.bool,
device=x.device)
dense_mask.scatter_(-1, sparse_index, True)
if use_dense_for_last_block:
dense_mask[:, :, -2:, :] = True
dense_mask.tril_()
return dense_mask
def get_sparse_attn_mask_from_threshold(x, threshold, use_dense_for_last_block=False):
dense_mask = x > threshold
if use_dense_for_last_block:
dense_mask[:, :, -2:, :] = True
dense_mask.tril_()
return dense_mask
@triton.jit
def _fwd_kernel_inner(
acc,
l_i,
m_i,
q,
k_block_col_idx,
block_mask_ptr,
k_ptrs,
v_ptrs,
offs_m,
offs_n,
stride_kt,
stride_vt,
stride_bmask_n,
sm_scale,
seqlen_k,
past_len,
LAST_K_BLOCK: tl.constexpr,
BLOCK_M: tl.constexpr,
BLOCK_N: tl.constexpr,
):
mask_val = tl.load(block_mask_ptr + k_block_col_idx * stride_bmask_n)
if mask_val == True:
start_n = k_block_col_idx * BLOCK_N
# -- compute qk ----
k = tl.load(k_ptrs + start_n * stride_kt)
qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
qk += tl.dot(q, k)
qk *= sm_scale
# the following is needed only when LAST_K_BLOCK or BLOCK_M < BLOCK_N
if LAST_K_BLOCK:
qk += tl.where(offs_m[:, None] + past_len >= (start_n + offs_n[None, :]), 0,
float('-inf'))
m_ij = tl.maximum(m_i, tl.max(qk, 1))
qk -= m_ij[:, None]
p = tl.exp(qk)
l_ij = tl.sum(p, 1)
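# Online-softmax update: rescale the running denominator and accumulator by exp(m_i - m_ij)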
alpha = tl.exp(m_i - m_ij)
l_i = l_i * alpha + l_ij
acc = acc * alpha[:, None]
# update acc
v = tl.load(v_ptrs + start_n * stride_vt)
p = p.to(v.type.element_ty)
acc += tl.dot(p, v)
# update m_i and l_i
m_i = m_ij
return acc, l_i, m_i
@triton.jit
def _fwd_kernel(
Q,
K,
V,
sm_scale,
block_mask_ptr,
Out,
stride_qz,
stride_qh,
stride_qm,
stride_qd,
stride_kz,
stride_kh,
stride_kn,
stride_kd,
stride_vz,
stride_vh,
stride_vn,
stride_vd,
stride_bmz,
stride_bmh,
stride_bmm,
stride_bmn,
stride_oz,
stride_oh,
stride_om,
stride_od,
H,
N_CTX,
PAST_LEN,
BLOCK_M: tl.constexpr,
BLOCK_N: tl.constexpr,
BLOCK_DMODEL: tl.constexpr,
):
Q_LEN = N_CTX - PAST_LEN
start_m = tl.program_id(0)
off_hz = tl.program_id(1)
off_h = off_hz % H
off_z = off_hz // H
Q += off_z * stride_qz + off_h * stride_qh
K += off_z * stride_kz + off_h * stride_kh
V += off_z * stride_vz + off_h * stride_vh
block_mask_ptr += off_z * stride_bmz + off_h * stride_bmh
# initialize offsets
offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
offs_n = tl.arange(0, BLOCK_N)
offs_d = tl.arange(0, BLOCK_DMODEL)
off_q = offs_m[:, None] * stride_qm + offs_d[None, :] * stride_qd
# off_k = offs_n[:, None] * stride_kn + offs_d[None, :] * stride_kd
off_k = offs_n[None, :] * stride_kn + offs_d[:, None] * stride_kd
off_v = offs_n[:, None] * stride_vn + offs_d[None, :] * stride_vd
# Initialize pointers to Q, K, V
q_ptrs = Q + off_q
k_ptrs = K + off_k
v_ptrs = V + off_v
mask_ptrs = block_mask_ptr + start_m * stride_bmm
m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float('inf')
l_i = tl.zeros([BLOCK_M], dtype=tl.float32)
acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32)
q = tl.load(q_ptrs, mask=offs_m[:, None] < Q_LEN)
k_block_start = 0
k_block_end = tl.cdiv((start_m + 1) * BLOCK_M, BLOCK_N)
# loop over k, v and update accumulator
for col_idx in range(k_block_start, k_block_end):
acc, l_i, m_i = _fwd_kernel_inner(
acc,
l_i,
m_i,
q,
col_idx,
mask_ptrs,
k_ptrs,
v_ptrs,
offs_m,
offs_n,
stride_kn,
stride_vn,
stride_bmn,
sm_scale,
N_CTX,
PAST_LEN,
col_idx == k_block_end - 1,
BLOCK_M,
BLOCK_N,
)
m_i += tl.math.log(l_i)
l_recip = 1 / l_i[:, None]
acc = acc * l_recip
acc = acc.to(Out.dtype.element_ty)
off_o = off_z * stride_oz + off_h * stride_oh + offs_m[:, None] * stride_om + offs_d[
None, :] * stride_od
out_ptrs = Out + off_o
tl.store(out_ptrs, acc, mask=offs_m[:, None] < N_CTX)
def _forward(ctx,
q,
k,
v,
block_sparse_mask,
sm_scale,
BLOCK_M=64,
BLOCK_N=64,
num_warps=None,
num_stages=1,
out=None):
assert q.shape[-1] == k.shape[-1] == v.shape[-1]
assert k.shape[2] == v.shape[2]
o = out if out is not None else torch.empty_like(q).contiguous()
grid = (triton.cdiv(q.shape[2], BLOCK_M), q.shape[0] * q.shape[1])
assert q.shape[-1] in [64, 128]
BLOCK_DMODEL = q.shape[-1]
if is_hip():
num_warps, num_stages = 8, 1
else:
num_warps, num_stages = 4, 2
N_CTX = k.shape[2]
PAST_LEN = N_CTX - q.shape[2]
H = q.shape[1]
_fwd_kernel[grid](
q,
k,
v,
sm_scale,
block_sparse_mask,
o,
*q.stride(),
*k.stride(),
*v.stride(),
*block_sparse_mask.stride(),
*o.stride(),
H,
N_CTX,
PAST_LEN,
BLOCK_M,
BLOCK_N,
BLOCK_DMODEL,
num_warps=num_warps,
num_stages=num_stages,
)
return o
class _sparse_attention(torch.autograd.Function):
@staticmethod
def forward(ctx, q, k, v, block_sparse_dense, sm_scale):
# shape constraints
return _forward(ctx, q, k, v, block_sparse_dense, sm_scale)
@staticmethod
def backward(ctx, do):
# No gradient propagation.
raise NotImplementedError("It does not support gradient propagation yet")
return None, None, None, None, None
block_sparse_triton_fn = _sparse_attention.apply
def benchmark_topk_sparse_attention():
from benchmark_configs import configs
torch.manual_seed(0)
# Config
for BATCH, N_HEADS, SEQ_LEN, D_HEAD, TOPK, BLOCK in configs:
# Create inputs
q = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.float16)
k = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.float16)
v = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.float16)
sm_scale = 1.0 / (D_HEAD**0.5)
# Create sparse mask (downsampled to block level)
downsample_factor = BLOCK
downsample_len = math.ceil(SEQ_LEN / downsample_factor)
x_ds = torch.randn([BATCH, N_HEADS, downsample_len, downsample_len],
device='cuda',
dtype=torch.bfloat16)
x_ds[:, :, :, 0] = 100
block_mask = get_sparse_attn_mask_from_topk(x_ds, topk=TOPK)
def benchmark_fn():
# Run the Triton block-sparse attention kernel
block_sparse_triton_fn(q, k, v, block_mask, sm_scale) # noqa: B023
ref_latency = do_bench(
benchmark_fn,
warmup=10,
rep=100,
)
print(
f"BATCH: {BATCH}, N_HEADS: {N_HEADS}, SEQ_LEN: {SEQ_LEN}, D_HEAD: {D_HEAD}, TOPK: {TOPK}, BLOCK: {BLOCK}, ref_latency: {ref_latency}"
)
if __name__ == "__main__":
benchmark_topk_sparse_attention()
# Mamba2_chunk_scan Benchmark
This document records the throughput achieved by `benchmark_mamba_chunk_scan.py` when computing `batch = 8`, `heads = 80`, `groups = 1`, `chunk_size = 256`, `dim = 64`, and `dstate = 128` across different `seq_len` using the default autotuning search space.
## Environment
- Repository commit: `8a5eb569704bfea64478c29adcfe3a09e3c2b12c`
- GPUs: `NVIDIA H800 SXM` on driver `560.35.05`
## How to Reproduce
```bash
cd benchmark/mamba2
python - <<'PY'
from benchmark_mamba_chunk_scan import chunk_scan_fwd
batch = 8
heads = 80
groups = 1
chunk_size = 256
dim = 64
dstate = 128
for seq_len in [1024, 2048, 4096, 8192, 16384, 32768]:
res = chunk_scan_fwd(
batch,
seq_len,
chunk_size,
groups,
heads,
dim,
dstate)
tflops = (2 * batch * seq_len * chunk_size * heads * dim * 0.5 + 2 * batch * seq_len * heads * dim * dstate) / res.latency * 1e-9
print(f"seq_len={seq_len:5d} latency={res.latency:.6f}ms TFlops={tflops:.3f}")
PY
```
## Results
| Seq_len | Latency (ms) | Throughput (TFLOPS) |
|---------|--------------|---------------------|
| 1024 | 0.169 | 126.477 |
| 2048 | 0.329 | 130.195 |
| 4096 | 0.645 | 133.054 |
| 8192 | 1.278 | 134.362 |
| 16384 | 2.531 | 135.711 |
| 32768 | 5.076 | 135.379 |
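As a sanity check, the throughput column follows directly from the FLOP count used in the reproduction script above; recomputing the first row:

```python
# Recompute the seq_len = 1024 row from the FLOP formula in the script above
# (latency taken from the table, rounded to three decimals).
batch, heads, chunk_size, dim, dstate = 8, 80, 256, 64, 128
seq_len, latency_ms = 1024, 0.169

flops = (2 * batch * seq_len * chunk_size * heads * dim * 0.5
         + 2 * batch * seq_len * heads * dim * dstate)
tflops = flops / latency_ms * 1e-9  # latency is in ms, hence the 1e-9 factor
print(f"{tflops:.1f} TFLOPS")       # ~127, matching the ~126.5 reported above
```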
## Compare with Baselines
- Triton: v3.5.0, mamba-ssm: v2.2.6.post3
- Helion: v0.2.1
<figure style="text-align: center">
<a href="mamba_benchmark_result.png">
<img src="mamba_benchmark_result.png" alt="Mamba2_chunk_scan Performance Comparison on H100">
</a>
<figcaption style="text-align: center;">Performance comparison across compilers on NVIDIA H100</figcaption>
</figure>
import argparse
import torch
import tilelang
from tilelang.autotuner import *
import tilelang.language as T
from einops import rearrange, repeat
import itertools
import math
from tilelang.profiler import do_bench
try:
from mamba_ssm.ops.triton.ssd_chunk_scan import _chunk_scan_fwd
except ImportError as err:
raise ImportError("Please install mamba-ssm to use the triton chunk scan operator.") from err
try:
import helion
from helion._testing import run_example
import helion.language as hl
except ImportError as err:
raise ImportError("Please install helion to use the helion chunk scan operator.") from err
def ref_program(cb, x, dt, dA_cumsum, C, prev_states, D):
"""
Argument:
cb: (batch, nchunks, ngroups, chunk_size, chunk_size)
x: (batch, seqlen, nheads, headdim)
dt: (batch, nheads, nchunks, chunk_size)
dA_cumsum: (batch, nheads, nchunks, chunk_size)
C: (batch, seqlen, ngroups, dstate)
prev_states: (batch, nchunks, nheads, headdim, dstate)
D: (nheads, headdim) or (nheads,)
z: (batch, seqlen, nheads, headdim)
Return:
out: (batch, seqlen, nheads, headdim)
"""
_, _, ngroups, _, _ = cb.shape
batch, seqlen, nheads, headdim = x.shape
# _, _, ngroups, dstate = B.shape
# assert B.shape == (batch, seqlen, ngroups, dstate)
_, _, nchunks, chunk_size = dt.shape
assert seqlen == nchunks * chunk_size
# assert C.shape == B.shape
# B = repeat(B, "b l g d -> b l (g h) d", h=nheads // ngroups)
C = repeat(C, "b l g d -> b l (g h) d", h=nheads // ngroups)
cb = repeat(cb, "b c g l s -> b c (g h) l s", h=nheads // ngroups)
# CB = torch.einsum("bclhn,bcshn->bchls", rearrange(C, "b (c l) h n -> b c l h n", c=nchunks),
# rearrange(B, "b (c s) h n -> b c s h n", c=nchunks))
# (batch, nheads, nchunks, chunksize, chunksize)
dt_segment_sum = dA_cumsum[:, :, :, :, None] - dA_cumsum[:, :, :, None, :]
decay = torch.exp(dt_segment_sum)
scores_decay = cb * rearrange(decay, "b h c l s -> b c h l s")
causal_mask = torch.tril(
torch.ones(chunk_size, chunk_size, device=x.device, dtype=bool), diagonal=0)
scores_decay = scores_decay.masked_fill(~causal_mask, 0)
out = torch.einsum('bchls,bhcs,bcshp->bclhp', scores_decay.to(x.dtype), dt.to(x.dtype),
rearrange(x, "b (c s) h p -> b c s h p", c=nchunks))
state_decay_out = torch.exp(rearrange(dA_cumsum, "b h c l -> b c l h 1"))
out_prev = torch.einsum('bclhn,bchpn->bclhp', rearrange(
C, "b (c l) h n -> b c l h n", c=nchunks), prev_states.to(C.dtype)) * state_decay_out
out = out + out_prev
out = rearrange(out, "b c l h p -> b (c l) h p")
if D is not None:
if D.dim() == 1:
D = rearrange(D, "h -> h 1")
out = out + x * D
return out
def chunk_scan_triton(cb, x, dt, dA_cumsum, C, states, D):
out, _ = _chunk_scan_fwd(cb, x, dt, dA_cumsum, C, states, D)
return out
def chunk_scan_helion(cb, x, dt, dA_cumsum, C, states, D):
@helion.kernel()
def helion_mamba2_chunk_scan_kernel(
cb: torch.Tensor,
x: torch.Tensor,
dt: torch.Tensor,
dA_cumsum: torch.Tensor,
C: torch.Tensor,
prev_states: torch.Tensor,
D: torch.Tensor,
) -> torch.Tensor:
"""
Argument:
cb: (batch, nchunks, ngroups, chunk_size, chunk_size)
x: (batch, seqlen, nheads, headdim)
dt: (batch, nheads, nchunks, chunk_size)
dA_cumsum: (batch, nheads, nchunks, chunk_size)
C: (batch, seqlen, ngroups, dstate)
prev_states: (batch, nchunks, nheads, headdim, dstate)
D: (nheads,)
Return:
out: (batch, seqlen, nheads, headdim)
"""
batch, nchunks, ngroups, chunk_size, _ = cb.shape
_, seqlen, nheads, headdim = x.shape
_, _, _, dstate = C.shape
assert nchunks == (seqlen + chunk_size - 1) // chunk_size
block_m = hl.register_block_size(chunk_size)
block_n = hl.register_block_size(headdim)
block_k = hl.register_block_size(64, 64)
dstate = hl.specialize(dstate)
assert cb.shape == (batch, nchunks, ngroups, chunk_size, chunk_size)
assert x.shape == (batch, seqlen, nheads, headdim)
assert dt.shape == (batch, nheads, nchunks, chunk_size)
assert dA_cumsum.shape == (batch, nheads, nchunks, chunk_size)
assert C.shape == (batch, seqlen, ngroups, dstate)
assert prev_states.shape == (batch, nchunks, nheads, headdim, dstate)
assert D.shape == (nheads,)
dtype = cb.dtype
accum_dtype = torch.float32
assert (x.dtype == dt.dtype == dA_cumsum.dtype == C.dtype == prev_states.dtype == D.dtype ==
dtype)
out = torch.empty_like(x)
p = 1.44269504
for tile_h, tile_m, tile_n, tile_b, tile_c in hl.tile(
[nheads, chunk_size, headdim, batch, nchunks],
block_size=[1, block_m, block_n, 1, 1],
):
acc_o = hl.zeros([tile_m, tile_n], dtype=accum_dtype)
dA_cumsum_local_m = dA_cumsum[tile_b.begin, tile_h.begin, tile_c.begin,
tile_m].to(torch.float32)
scale_m_local = torch.exp2(dA_cumsum_local_m * p)
C_local = C[
tile_b.begin,
tile_m.index + tile_c.begin * chunk_size,
tile_h.begin // (nheads // ngroups),
:,
]
prev_states_local = prev_states[tile_b.begin, tile_c.begin, tile_h.begin, tile_n, :]
acc_o = hl.dot(C_local, prev_states_local.T, acc=acc_o)
acc_o *= scale_m_local[:, None]
for tile_k in hl.tile((tile_m.id + 1) * block_m, block_size=block_k):
cb_local = cb[
tile_b.begin,
tile_c.begin,
tile_h.begin // (nheads // ngroups),
tile_m,
tile_k,
]
dA_cumsum_local_k = dA_cumsum[tile_b.begin, tile_h.begin, tile_c.begin,
tile_k].to(torch.float32)
cb_local *= torch.exp2(dA_cumsum_local_m[:, None] * p -
dA_cumsum_local_k[None, :] * p)
dt_local = dt[tile_b.begin, tile_h.begin, tile_c.begin, tile_k].to(torch.float32)
cb_local = (cb_local * dt_local[None, :]).to(dtype)
pred = (tile_m.index + 0)[:, None] >= (tile_k.index + 0)[None, :]
cb_local = torch.where(pred, cb_local, torch.zeros_like(cb_local))
x_local = x[
tile_b.begin,
tile_c.begin * chunk_size + tile_k.index,
tile_h.begin,
tile_n,
]
acc_o = hl.dot(cb_local, x_local, acc=acc_o)
D_local = D[tile_h.begin].to(torch.float32)
x_residual = x[tile_b.begin, tile_c.begin * chunk_size + tile_m.index, tile_h.begin,
tile_n].to(torch.float32)
acc_o += x_residual * D_local
out[tile_b.begin, tile_c.begin * chunk_size + tile_m.index, tile_h.begin,
tile_n] = acc_o.to(dtype=dtype)
return out
args = (cb, x, dt, dA_cumsum, C, states, D)
run_example(helion_mamba2_chunk_scan_kernel, ref_program, args)
def get_configs():
iter_params = dict(
block_M=[64, 128, 256],
block_N=[32, 64],
block_K=[64, 128, 256],
block_Dstate=[128],
num_stages=[1, 2, 3, 4, 5])
return [dict(zip(iter_params, values)) for values in itertools.product(*iter_params.values())]
@autotune(configs=get_configs(), warmup=10, rep=10)
@tilelang.jit(
out_idx=[7],
pass_configs={
tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True,
},
)
def chunk_scan_fwd(batch,
seqlen,
chunk_size,
ngroups,
nheads,
headdim,
dstate,
block_M=64,
block_N=64,
block_K=64,
block_Dstate=128,
num_stages=2,
threads=128):
dtype = "float16"
accum_dtype = "float"
nchunks = T.ceildiv(seqlen, chunk_size)
p = 1.44269504
@T.prim_func
def main(
cb: T.Tensor((batch, nchunks, ngroups, chunk_size, chunk_size), dtype), # type: ignore
x: T.Tensor((batch, seqlen, nheads, headdim), dtype), # type: ignore
dt: T.Tensor((batch, nheads, nchunks, chunk_size), dtype), # type: ignore
dA_cumsum: T.Tensor((batch, nheads, nchunks, chunk_size), dtype), # type: ignore
C: T.Tensor((batch, seqlen, ngroups, dstate), dtype), # type: ignore
prev_states: T.Tensor((batch, nchunks, nheads, headdim, dstate), dtype), # type: ignore
D: T.Tensor((nheads), dtype), # type: ignore
Output: T.Tensor((batch, seqlen, nheads, headdim), dtype) # type: ignore
):
with T.Kernel(
nheads,
T.ceildiv(chunk_size, block_M) * T.ceildiv(headdim, block_N),
batch * nchunks,
threads=threads) as (bz, bx, by):
acc_o = T.alloc_fragment((block_M, block_N), accum_dtype)
acc_o_shared = T.alloc_shared((block_M, block_N), dtype)
cb_shared = T.alloc_shared((block_M, block_K), dtype, scope="shared.dyn")
cb_local = T.alloc_fragment((block_M, block_K), dtype)
dA_cs_k_shared = T.alloc_shared((block_K), dtype, scope="shared")
dA_cs_k_local = T.alloc_fragment((block_K), accum_dtype)
dA_cs_m_local = T.alloc_fragment((block_M), accum_dtype)
dt_shared = T.alloc_shared((block_K), dtype, scope="shared")
dt_local = T.alloc_fragment((block_K), accum_dtype)
x_shared = T.alloc_shared((block_K, block_N), dtype, scope="shared.dyn")
dA_cs_m_shared = T.alloc_shared((block_M), dtype, scope="shared")
scale_m_local = T.alloc_fragment((block_M), accum_dtype)
C_shared = T.alloc_shared((block_M, block_Dstate), dtype)
prev_state_shared = T.alloc_shared((block_N, block_Dstate), dtype)
D_local = T.alloc_fragment((1), accum_dtype)
x_residual_shared = T.alloc_shared((block_M, block_N), dtype, scope="shared.dyn")
x_residual_local = T.alloc_fragment((block_M, block_N), accum_dtype)
batch_idx = by % batch
chunk_idx = by // batch
# m: chunk_size
# n : headdim
m_idx = bx // T.ceildiv(headdim, block_N)
n_idx = bx % T.ceildiv(headdim, block_N)
T.annotate_layout({
acc_o_shared: tilelang.layout.make_swizzled_layout(acc_o_shared),
cb_shared: tilelang.layout.make_swizzled_layout(cb_shared),
x_residual_shared: tilelang.layout.make_swizzled_layout(x_residual_shared)
})
T.no_set_max_nreg()
T.copy(dA_cumsum[batch_idx, bz, chunk_idx, m_idx * block_M:(m_idx + 1) * block_M],
dA_cs_m_shared)
T.copy(dA_cs_m_shared, dA_cs_m_local)
T.clear(acc_o)
for i in T.Parallel(block_M):
scale_m_local[i] = T.exp2(dA_cs_m_local[i] * p)
T.copy(
C[batch_idx, chunk_idx * chunk_size + m_idx * block_M:chunk_idx * chunk_size +
(m_idx + 1) * block_M, bz // (nheads // ngroups), 0:block_Dstate], C_shared)
T.copy(
prev_states[batch_idx, chunk_idx, bz, n_idx * block_N:(n_idx + 1) * block_N,
0:block_Dstate], prev_state_shared)
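            # Inter-chunk term: C_tile @ prev_states^T carries in the states from earlier
            # chunks; it is scaled by the per-row decay exp2(dA_cs_m * log2(e)) just below.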
T.gemm(C_shared, prev_state_shared, acc_o, transpose_B=True)
for i, j in T.Parallel(block_M, block_N):
acc_o[i, j] *= scale_m_local[i]
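            # Because of the causal mask, only K-blocks up to the end of this M-tile
            # (i.e. the first (m_idx + 1) * block_M positions) can contribute to it.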
loop_range = T.ceildiv((m_idx + 1) * block_M, block_K)
for k in T.Pipelined(loop_range, num_stages=num_stages):
T.copy(
cb[batch_idx, chunk_idx, bz // (nheads // ngroups),
m_idx * block_M:(m_idx + 1) * block_M, k * block_K:(k + 1) * block_K],
cb_shared)
T.copy(cb_shared, cb_local)
T.copy(dA_cumsum[batch_idx, bz, chunk_idx, k * block_K:(k + 1) * block_K],
dA_cs_k_shared)
T.copy(dA_cs_k_shared, dA_cs_k_local)
for i, j in T.Parallel(block_M, block_K):
cb_local[i,
j] = cb_local[i,
j] * T.exp2(dA_cs_m_local[i] * p - dA_cs_k_local[j] * p)
T.copy(dt[batch_idx, bz, chunk_idx, k * block_K:(k + 1) * block_K], dt_shared)
T.copy(dt_shared, dt_local)
for i, j in T.Parallel(block_M, block_K):
cb_local[i, j] *= dt_local[j]
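                # Causal (lower-triangular) mask within the chunk: zero entries whose
                # key position k * block_K + j lies after the query position m_idx * block_M + i.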
for i, j in T.Parallel(block_M, block_K):
cb_local[i, j] = T.if_then_else(m_idx * block_M + i >= k * block_K + j,
cb_local[i, j], 0)
T.copy(
x[batch_idx, chunk_idx * chunk_size + k * block_K:chunk_idx * chunk_size +
(k + 1) * block_K, bz, n_idx * block_N:(n_idx + 1) * block_N], x_shared)
T.gemm(cb_local, x_shared, acc_o)
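            # Skip connection: add D[head] * x to the accumulated output before the write-back.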
D_local[0] = D[bz]
T.copy(
x[batch_idx, chunk_idx * chunk_size + m_idx * block_M:chunk_idx * chunk_size +
(m_idx + 1) * block_M, bz, n_idx * block_N:(n_idx + 1) * block_N],
x_residual_shared)
T.copy(x_residual_shared, x_residual_local)
for i, j in T.Parallel(block_M, block_N):
acc_o[i, j] += x_residual_local[i, j] * D_local[0]
T.copy(acc_o, acc_o_shared)
T.copy(
acc_o_shared,
Output[batch_idx, chunk_idx * chunk_size + m_idx * block_M:chunk_idx * chunk_size +
(m_idx + 1) * block_M, bz, n_idx * block_N:(n_idx + 1) * block_N])
return main
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('--batch', type=int, default=8, help='batch size')
parser.add_argument('--heads', type=int, default=80, help='heads')
parser.add_argument('--groups', type=int, default=1, help='groups')
parser.add_argument('--seq_len', type=int, default=4096, help='sequence length')
parser.add_argument('--chunk_size', type=int, default=256, help='chunk size')
parser.add_argument('--dim', type=int, default=64, help='dim')
parser.add_argument('--dstate', type=int, default=128, help='dstate')
parser.add_argument('--tune', action='store_true', help='tune configs')
args = parser.parse_args()
batch, heads, groups, seq_len, chunk_size, dim, dstate = args.batch, args.heads, args.groups, args.seq_len, args.chunk_size, args.dim, args.dstate
nchunks = math.ceil(seq_len / chunk_size)
    # FLOPs: intra-chunk causal GEMM (~half the MACs survive the mask) + inter-chunk state GEMM
    total_flops = 2 * batch * seq_len * chunk_size * heads * dim * 0.5 + 2 * batch * seq_len * heads * dim * dstate
print("Benchmarking TileLang...")
kernel = chunk_scan_fwd(batch, seq_len, chunk_size, groups, heads, dim, dstate)
best_latency = kernel.latency
best_config = kernel.config
ref_latency = kernel.ref_latency
print(f"Best latency: {best_latency}")
print(f"Best TFlops: {total_flops / best_latency * 1e-9}")
print(f"Best config: {best_config}")
cb = torch.randn(batch, nchunks, groups, chunk_size, chunk_size).half().cuda()
x = torch.randn(batch, seq_len, heads, dim).half().cuda()
dt = torch.randn(batch, heads, nchunks, chunk_size).half().cuda()
dA_cumsum = torch.randn(batch, heads, nchunks, chunk_size).half().cuda()
C = torch.randn(batch, seq_len, groups, dstate).half().cuda()
states = torch.randn(batch, nchunks, heads, dim, dstate).half().cuda()
D = torch.randn(heads).half().cuda()
print("Benchmarking Triton...")
triton_latency = do_bench(
lambda: chunk_scan_triton(cb, x, dt, dA_cumsum, C, states, D), _n_warmup=10, _n_repeat=10)
print(f"Triton TFlops: {total_flops / triton_latency * 1e-9}")
print("Benchmarking Helion...")
chunk_scan_helion(cb, x, dt, dA_cumsum, C, states, D)
# FP16 Matmul Benchmark (8192×8192)
This document records the throughput achieved by `benchmark_matmul.py` when multiplying FP16 matrices sized `M = N = 8192` across different `K` dimensions using the default autotuning search space.
## Environment
- Repository commit: `17bd0a6c651f599bec1397e0b91830c3ddc93076`
- GPUs: `NVIDIA H800 SXM` on driver `560.35.05`
## How to Reproduce
```bash
cd benchmark/matmul
python - <<'PY'
from benchmark_matmul import matmul
M = 8192
N = 8192
for K in [256, 512, 1024, 2048, 4096, 8192, 16384]:
res = matmul(M, N, K, False)
    tflops = 2 * M * N * K / res.latency * 1e-9  # res.latency is reported in milliseconds
    print(f"K={K:5d} latency={res.latency:.6f} ms TFlops={tflops:.3f}")
PY
```
## Results
| K     | Latency (ms) | Throughput (TFLOPS) |
|-------|-------------|---------------------|
| 256 | 0.089056 | 386 |
| 512 | 0.132064 | 520 |
| 1024 | 0.218816 | 628 |
| 2048 | 0.390112 | 705 |
| 4096 | 0.746752 | 736 |
| 8192 | 1.449888 | 758 |
| 16384 | 2.871168 | 766 |
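Each throughput figure follows directly from its latency and the fixed FLOP count `2 * M * N * K`, assuming latencies are in milliseconds (consistent with the `1e-9` scaling used by the benchmark script). As a sanity check, the short snippet below recomputes the `K = 16384` row; it is plain Python and needs no GPU.
```python
# Recompute the K = 16384 row of the Results table (latency assumed to be in ms).
M = N = 8192
K = 16384
latency_ms = 2.871168                        # value from the Results table
flops = 2 * M * N * K                        # one multiply-add counted as 2 FLOPs
tflops = flops / (latency_ms * 1e-3) / 1e12
print(f"{tflops:.0f} TFLOPS")                # prints 766, matching the table
```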
import argparse
import itertools
import logging
import tilelang
import tilelang.language as T
from tilelang.autotuner import autotune
from tilelang import jit
# Configure logger
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
def ref_program(A, B):
"""
A reference matrix multiplication program, used to compare performance.
Parameters
----------
A : numpy.ndarray
The matrix with shape (M, K).
B : numpy.ndarray
The matrix with shape (N, K).
Returns
-------
np.ndarray
The result of A @ B.T, shape (M, N).
"""
return A @ B.T
def get_configs(args, kwargs):
"""
Generate a list of configuration dictionaries that will be used for tuning.
    Parameters
    ----------
    args : tuple
        Positional arguments forwarded by the autotuner; the first four are
        (M, N, K, with_roller). If with_roller is True, the BitBLAS roller is
        used to deduce the search space.
    kwargs : dict
        Keyword arguments forwarded by the autotuner (unused here).
Returns
-------
list of dict
Each configuration dict includes various block sizes, pipeline stages,
thread numbers, and other parameters to explore during autotuning.
"""
M, N, K, with_roller = args[:4]
if with_roller:
from tilelang.carver.template import MatmulTemplate
from tilelang.carver.arch import CUDA
from tilelang.carver.arch import CDNA
from tilelang.carver.roller.rasterization import NoRasterization
import torch
arch = CUDA("cuda") if torch.version.hip is None else CDNA("hip")
topk = 10
carve_template = MatmulTemplate(
M=M,
N=N,
K=K,
in_dtype="float16",
out_dtype="float16",
accum_dtype="float",
).with_arch(arch)
func = carve_template.equivalent_function()
assert func is not None, "Function is None"
roller_hints = carve_template.recommend_hints(topk=topk)
if roller_hints is None:
raise ValueError("No Roller Hints Found for TensorCore Scheduling")
configs = []
for hint in roller_hints:
config = {}
block_m, block_n = hint.block
warp_m, warp_n = hint.warp
            # block_rows, block_cols represent the warp partitioning
block_rows, block_cols = block_m // warp_m, block_n // warp_n
config["block_M"] = block_m
config["block_N"] = block_n
config["block_K"] = hint.rstep[0]
config["num_stages"] = hint.pipeline_stage
config["thread_num"] = block_rows * block_cols * 32
config["policy"] = T.GemmWarpPolicy.from_warp_partition(block_rows, block_cols)
config["enable_rasteration"] = hint.rasterization_plan is not NoRasterization
configs.append(config)
for config in configs:
print(config)
else:
iter_params = dict(
block_M=[64, 128, 256],
block_N=[64, 128, 256],
block_K=[32, 64],
num_stages=[0, 1, 2, 3],
thread_num=[128, 256],
policy=[T.GemmWarpPolicy.Square],
enable_rasteration=[True, False],
)
return [{
k: v for k, v in zip(iter_params, values)
} for values in itertools.product(*iter_params.values())]
return configs
@autotune(
configs=get_configs,
warmup=3,
rep=20,
)
@jit(out_idx=[2],)
def matmul(
M,
N,
K,
with_roller,
block_M=None,
block_N=None,
block_K=None,
num_stages=None,
thread_num=None,
policy=None,
enable_rasteration=None,
):
"""
Create an autotuned matrix multiplication kernel for matrices of shape:
- A: (M, K)
- B: (N, K)
- C: (M, N)
Parameters
----------
M : int
The dimension M of the matrix multiplication.
N : int
The dimension N of the matrix multiplication.
K : int
The dimension K of the matrix multiplication.
Returns
-------
    An autotuner result whose attributes include:
        latency : float
            The best latency (in ms) found among the tuned configurations.
        config : dict
            The parameter configuration that yielded the best latency.
        ref_latency : float
            The baseline latency of the reference program (for computing speedup).
"""
# Use half-precision for input data to reduce memory bandwidth,
# accumulate in float for better numerical accuracy
dtype = "float16"
accum_dtype = "float"
@T.prim_func
def main(
A: T.Tensor((M, K), dtype),
B: T.Tensor((N, K), dtype),
C: T.Tensor((M, N), dtype),
):
"""
The compiled TVM function for block-level matrix multiplication.
- We divide the entire (M, N) domain into blocks of shape
(block_M, block_N).
- Each block has its own allocated shared memory for sub-blocks
of A and B.
- The partial results go into C_local, and then we copy them back
to global memory C.
"""
# Bind x-dimension to block index in N,
# y-dimension to block index in M.
with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=thread_num) as (bx, by):
# Allocate shared memory for A sub-block of shape (block_M, block_K)
A_shared = T.alloc_shared((block_M, block_K), dtype)
# Allocate shared memory for B sub-block of shape (block_N, block_K)
B_shared = T.alloc_shared((block_N, block_K), dtype)
# Allocate a local fragment for intermediate accumulation
C_local = T.alloc_fragment((block_M, block_N), accum_dtype)
# Allocate a shared memory for C sub-block of shape (block_M, block_N)
C_shared = T.alloc_shared((block_M, block_N), dtype)
# Enable (or disable) swizzling optimization
T.use_swizzle(panel_size=10, enable=enable_rasteration)
            # Annotate C_shared with a swizzled layout so the epilogue copy can use the swizzled TMA layout
T.annotate_layout({C_shared: tilelang.layout.make_swizzled_layout(C_shared)})
# Clear out the accumulation buffer
T.clear(C_local)
# Loop over sub-blocks in K dimension, pipelined by num_stages
for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages):
# Load a sub-block of A from global memory into A_shared
T.copy(A[by * block_M, k * block_K], A_shared)
# Load a sub-block of B from global memory into B_shared
T.copy(B[bx * block_N, k * block_K], B_shared)
# Perform a partial matrix multiplication:
# C_local += A_shared @ B_shared^T
T.gemm(
A_shared,
B_shared,
C_local,
transpose_B=True,
policy=policy,
)
# Write back the results from C_local to the global memory C
T.copy(C_local, C_shared)
T.copy(C_shared, C[by * block_M, bx * block_N])
return main
if __name__ == "__main__":
# Parse command-line arguments for matrix dimensions
parser = argparse.ArgumentParser(description="Autotuned MatMul Benchmark")
parser.add_argument("--m", type=int, default=16384, help="Matrix dimension M")
parser.add_argument("--n", type=int, default=16384, help="Matrix dimension N")
parser.add_argument("--k", type=int, default=16384, help="Matrix dimension K")
parser.add_argument(
"--with_roller",
action="store_true",
help="Whether to enable BitBLAS roller for search space",
)
args = parser.parse_args()
M, N, K = args.m, args.n, args.k
with_roller = args.with_roller
# Compute total floating-point operations to measure throughput
total_flops = 2 * M * N * K
    # matmul(...) returns an autotune result; its latency, config, and ref_latency attributes are read below
best_result = matmul(M, N, K, with_roller)
best_latency = best_result.latency
best_config = best_result.config
ref_latency = best_result.ref_latency
# Print out the benchmark results
print(f"Best latency (s): {best_latency}")
print(f"Best TFlops: {total_flops / best_latency * 1e-9:.3f}")
print(f"Best config: {best_config}")
if ref_latency is not None:
print(f"Reference TFlops: {total_flops / ref_latency * 1e-9:.3f}")