diff --git a/.clang-tidy b/.clang-tidy index 1681ed66e854e010194c2a6694be552d97346813..f9b77bce8a0c2e6640e7d5f72a004527256ff510 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -4,7 +4,9 @@ ExtraArgs: [] FormatStyle: file UseColor: true WarningsAsErrors: '*' +# FIXME: Use `ExcludeHeaderFilterRegex` instead when all maintainers upgraded their `clang-tidy` HeaderFilterRegex: '^(?!.*(?:/|^)(3rdparty|tvm)/).*' +# ExcludeHeaderFilterRegex: '^(3rdparty|tvm)/.*$' # NOTE: there must be no spaces before the '-', so put the comma last. Checks: >- diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml index 3ba13e0cec6cbbfd462e9ebf529dd2093148cd69..0086358db1eb971c0cfa8739c27518bbc18a5ff4 100644 --- a/.github/ISSUE_TEMPLATE/config.yml +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -1 +1 @@ -blank_issues_enabled: false +blank_issues_enabled: true diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f9fe32861d3e451a426d8715f69601025a095f4e..d7abaeb0f1baf93c416675abea4848d541ba35c4 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -40,7 +40,7 @@ jobs: timeout-minutes: 30 steps: - name: Checkout repository - uses: actions/checkout@v5 + uses: actions/checkout@v6 with: fetch-depth: 0 submodules: recursive @@ -93,7 +93,7 @@ jobs: name: self-hosted-amd # Format: [Nightly-]ROCm-.[.]. E.g., "ROCm-6.4" or "Nightly-ROCm-7.0". # Use "Nightly-" prefix to use torch nightly builds. - toolkit: ROCm-6.3 + toolkit: Nightly-ROCm-7.1 - tags: [macos-latest] name: macos-latest toolkit: Metal # or Nightly-Metal @@ -104,7 +104,7 @@ jobs: steps: - name: Checkout repository - uses: actions/checkout@v5 + uses: actions/checkout@v6 with: fetch-depth: 0 submodules: recursive @@ -288,35 +288,59 @@ jobs: echo "Clearing uv cache at ${UV_CACHE_DIR} due to failure." uv cache clean + - name: Enable core dump generation (Linux / GitHub-hosted runners) + if: ${{ runner.os == 'Linux' && !startsWith(matrix.runner.name, 'self-hosted') }} + run: | + sudo sysctl -w kernel.core_pattern="core.${{ matrix.python-version }}.${{ matrix.runner.toolkit }}.%P" + sudo sysctl -w kernel.core_uses_pid=0 + sudo sysctl -w fs.suid_dumpable=1 + sysctl kernel.core_pattern kernel.core_uses_pid fs.suid_dumpable + + - name: Enable core dump generation (macOS / GitHub-hosted runners) + if: ${{ runner.os == 'macOS' && !startsWith(matrix.runner.name, 'self-hosted') }} + run: | + sudo sysctl -w kern.corefile="core.${{ matrix.python-version }}.${{ matrix.runner.toolkit }}.%P" + sudo sysctl -w kern.coredump=1 + sudo sysctl -w kern.sugid_coredump=1 + sysctl kern.corefile kern.coredump kern.sugid_coredump + + - name: Install project (wheel form) + run: | + uv pip install -v . 
+ - name: Run clang-tidy id: clang-tidy if: runner.os == 'Linux' run: | echo "\$ $(command -v clang-tidy) --version" && clang-tidy --version - if [[ -x "$(command -v run-clang-tidy)" ]]; then - echo "Using run-clang-tidy from $(command -v run-clang-tidy)" - CLANG_TIDY=(run-clang-tidy) - else - RCT_URL=https://raw.githubusercontent.com/llvm/llvm-project/refs/heads/release/21.x/clang-tools-extra/clang-tidy/tool/run-clang-tidy.py - echo "Downloading run-clang-tidy script from ${RCT_URL}" - echo "import urllib.request; url = '${RCT_URL}'.rstrip('/'); urllib.request.urlretrieve(url, url.split('/')[-1])" | uv run --no-project --script - - CLANG_TIDY=(uv run --no-project --script -- run-clang-tidy.py) - fi + # Download run-clang-tidy script + RCT_URL=https://raw.githubusercontent.com/llvm/llvm-project/refs/heads/release/21.x/clang-tools-extra/clang-tidy/tool/run-clang-tidy.py + echo "Downloading run-clang-tidy script from ${RCT_URL}" + echo "import urllib.request; url = '${RCT_URL}'.rstrip('/'); urllib.request.urlretrieve(url, url.split('/')[-1])" | uv run --no-project --script - + RUN_CLANG_TIDY=(uv run --no-project --script -- run-clang-tidy.py) + if [[ -x "$(command -v clang-apply-replacements)" ]]; then echo "Using clang-apply-replacements from $(command -v clang-apply-replacements)" - CLANG_TIDY+=(-fix -clang-apply-replacements-binary="$(command -v clang-apply-replacements)") + RUN_CLANG_TIDY+=(-fix -clang-apply-replacements-binary="$(command -v clang-apply-replacements)") else echo "::warning::clang-apply-replacements not found in PATH, automatic fixing disabled." fi # Run cmake to create the build directory with compile_commands.json cmake -S . -B cmake-build --fresh ${CLANG_TIDY_CMAKE_OPTIONS} # no quotes here + echo "::group::compile_commands.json" + ls -alh cmake-build/compile_commands.json + uv run --no-project -m -- json.tool --no-ensure-ascii cmake-build/compile_commands.json + echo "::endgroup::" CXX_FILES=$(find src -type f -iname "*.[ch]pp" -o -iname "*.cc" -o -iname "*.c" -o -iname "*.h") rc=0 - "${CLANG_TIDY[@]}" -clang-tidy-binary="$(command -v clang-tidy)" \ + echo "::group::run-clang-tidy" + "${RUN_CLANG_TIDY[@]}" -clang-tidy-binary="$(command -v clang-tidy)" \ + -exclude-header-filter='^(3rdparty|tvm)/.*$' \ -p="cmake-build" ${CXX_FILES} || rc="$?" + echo "::endgroup::" rm -rf cmake-build run-clang-tidy.py if (( rc != 0 )); then echo "::error::clang-tidy found issues (exit code: ${rc}). Please run 'clang-tidy --fix' locally to fix them." @@ -324,26 +348,6 @@ jobs: exit "${rc}" fi - - name: Enable core dump generation (Linux / GitHub-hosted runners) - if: ${{ runner.os == 'Linux' && !startsWith(matrix.runner.name, 'self-hosted') }} - run: | - sudo sysctl -w kernel.core_pattern="core.${{ matrix.python-version }}.${{ matrix.runner.toolkit }}.%P" - sudo sysctl -w kernel.core_uses_pid=0 - sudo sysctl -w fs.suid_dumpable=1 - sysctl kernel.core_pattern kernel.core_uses_pid fs.suid_dumpable - - - name: Enable core dump generation (macOS / GitHub-hosted runners) - if: ${{ runner.os == 'macOS' && !startsWith(matrix.runner.name, 'self-hosted') }} - run: | - sudo sysctl -w kern.corefile="core.${{ matrix.python-version }}.${{ matrix.runner.toolkit }}.%P" - sudo sysctl -w kern.coredump=1 - sudo sysctl -w kern.sugid_coredump=1 - sysctl kern.corefile kern.coredump kern.sugid_coredump - - - name: Install project (wheel form) - run: | - uv pip install -v . 
- - name: Run examples with Python ${{ matrix.python-version }} (${{ matrix.runner.toolkit }}) if: contains(matrix.runner.toolkit, 'CUDA') run: | @@ -366,8 +370,27 @@ jobs: pytest --verbose --color=yes --durations=0 --showlocals --cache-clear ) "${PYTEST[@]}" --maxfail=3 --numprocesses=4 \ + --ignore=./python/jit/test_tilelang_jit_cutedsl.py \ ./python + # CuTeDSL JIT tests require GEMM v1 (must be set before importing tilelang). + # Run them in a dedicated step to avoid changing the default GEMM selection + # (and to keep the rest of the CUDA tests on GEMM v2). + - name: Run CuTeDSL JIT tests (GEMM v1) with Python ${{ matrix.python-version }} (${{ matrix.runner.toolkit }}) + id: cutedsl-tests + if: contains(matrix.runner.toolkit, 'CUDA') + env: + TILELANG_USE_GEMM_V1: "1" + run: | + cd testing + PYTEST=( + uv run --no-project -m -- + pytest --verbose --color=yes --durations=0 --showlocals --cache-clear + ) + # Avoid xdist contention on a single GPU by running this file in one worker. + "${PYTEST[@]}" --maxfail=3 --numprocesses=1 \ + ./python/jit/test_tilelang_jit_cutedsl.py + # AMD ROCm tests - name: Run ROCm tests with Python ${{ matrix.python-version }} (${{ matrix.runner.toolkit }}) id: rocm-tests diff --git a/.github/workflows/dist.yml b/.github/workflows/dist.yml index 0ba3fbc30eb6d171b598bb5cde3f0832a355969f..6666871100bfe7934b31e1fa342bf0779d26d51d 100644 --- a/.github/workflows/dist.yml +++ b/.github/workflows/dist.yml @@ -1,5 +1,6 @@ name: Dist on: + workflow_dispatch: schedule: # gemini said this is 6:00 china time - cron: "0 22 * * *" @@ -52,7 +53,7 @@ jobs: steps: - name: Checkout repository - uses: actions/checkout@v5 + uses: actions/checkout@v6 with: fetch-depth: 1 submodules: recursive @@ -91,7 +92,7 @@ jobs: - name: Upload SDist # Not PR to save artifact storage, as SDist is only needed for releases. if: github.event_name != 'pull_request' || contains(github.event.pull_request.title, '[Release]') - uses: actions/upload-artifact@v5 + uses: actions/upload-artifact@v6 with: name: sdist path: dist/*.tar.gz @@ -105,14 +106,12 @@ jobs: strategy: matrix: target: - - { runner: ubuntu-latest, toolkit: "CUDA-12.1" } + - { runner: ubuntu-latest, toolkit: "CUDA-12.8" } - { runner: ubuntu-24.04-arm, toolkit: "CUDA-12.8" } - { runner: macos-latest, toolkit: "Metal" } python-version: # Wheels are built with Python 3.8 Limited API, they should work with all Python >= 3.8. # Only build wheels against Python 3.8 Limited API to save CI resources. - # FIXME: Here we use Python 3.9 because our dependency `apache-tvm-ffi` claims to support - # Python 3.8 but it depends on a version of `ml-dtypes` that requires Python >= 3.9. - "3.9" fail-fast: false timeout-minutes: 120 @@ -122,7 +121,7 @@ jobs: steps: - name: Checkout repository - uses: actions/checkout@v5 + uses: actions/checkout@v6 with: fetch-depth: 1 submodules: recursive @@ -160,7 +159,7 @@ jobs: fi - name: Build wheels - uses: pypa/cibuildwheel@v3.2 + uses: pypa/cibuildwheel@v3.3 with: package-dir: . output-dir: wheelhouse @@ -169,7 +168,7 @@ jobs: - name: Upload wheels # Not PR to save artifact storage, as wheels are only needed for releases. 
if: github.event_name != 'pull_request' || contains(github.event.pull_request.title, '[Release]') - uses: actions/upload-artifact@v5 + uses: actions/upload-artifact@v6 with: name: wheels-${{ matrix.python-version }}-${{ runner.os }}-${{ runner.arch }}-${{ matrix.target.toolkit }} path: wheelhouse/*.whl @@ -184,7 +183,7 @@ jobs: timeout-minutes: 15 steps: - name: Download built SDist - uses: actions/download-artifact@v6 + uses: actions/download-artifact@v7 with: # unpacks default artifact into dist/ # if `name: artifact` is omitted, the action will create extra parent dir @@ -192,7 +191,7 @@ jobs: path: dist - name: Download built wheels - uses: actions/download-artifact@v6 + uses: actions/download-artifact@v7 with: pattern: wheels-* path: dist @@ -202,7 +201,7 @@ jobs: run: ls -lh dist/* - name: Upload artifacts - uses: actions/upload-artifact@v5 + uses: actions/upload-artifact@v6 with: name: artifacts path: dist/* diff --git a/.github/workflows/pr-perfbench-bot.yml b/.github/workflows/pr-perfbench-bot.yml index 37da4e3c8f29d69b99fe967faad4bc9295efd305..e6954bcc45b08eb60b5d2ba83f26f18ce02635cb 100644 --- a/.github/workflows/pr-perfbench-bot.yml +++ b/.github/workflows/pr-perfbench-bot.yml @@ -33,7 +33,7 @@ jobs: runs-on: [self-hosted, nvidia] steps: - name: Checkout repository - uses: actions/checkout@v5 + uses: actions/checkout@v6 with: ref: refs/pull/${{ github.event.issue.number }}/merge fetch-depth: 0 diff --git a/.github/workflows/publish-docs.yml b/.github/workflows/publish-docs.yml index 953303102c1c38d2879f96a938e07e66b10e6e42..2197015b66df1c35babf2a941a9dced814ef5699 100644 --- a/.github/workflows/publish-docs.yml +++ b/.github/workflows/publish-docs.yml @@ -25,7 +25,7 @@ jobs: runs-on: [self-hosted, nvidia] steps: - name: Checkout repository - uses: actions/checkout@v5 + uses: actions/checkout@v6 with: fetch-depth: 0 submodules: recursive diff --git a/.gitignore b/.gitignore index 752f6cb766e9cf8fbcbece24a60c6e730b0d4b8f..8dc9fd8804d20688d64170e84f387467e235afba 100644 --- a/.gitignore +++ b/.gitignore @@ -108,3 +108,15 @@ cmake-build-*/ # pre-commit cache .pre-commit-cache/* + +# host checks logs +maint/host_checks/logs/* + +# ncu +*.ncu-rep + +# csv +*.csv + +# clang-tidy +/run-clang-tidy.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 615f173b9973ae99a8d9699638adb3d67c110452..3504adb6d1b417b628080255c497d681e0df512d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -32,30 +32,17 @@ repos: args: [--ignore-case] files: ^docs/spelling_wordlist\.txt$ - repo: https://github.com/pre-commit/mirrors-clang-format - rev: v21.1.2 # sync with requirements-lint.txt + rev: v21.1.7 # sync with requirements-lint.txt hooks: - id: clang-format - exclude: | - (?ix)( - ^.+\.(cu|cuh)$| - ^.+\.json$ - ) + types_or: [c++, c] - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.14.3 # sync with requirements-lint.txt + rev: v0.14.9 # sync with requirements-lint.txt hooks: - id: ruff-check args: [--fix, --exit-non-zero-on-fix] - - repo: https://github.com/google/yapf - rev: v0.43.0 # sync with requirements-lint.txt - hooks: - - id: yapf - name: yapf-multiproc-bugfix - # yapf is not multiprocess safe, so we run a dummy yapf first. 
- args: [--in-place, docs/conf.py] - always_run: true - pass_filenames: false - - id: yapf - args: [--recursive, --in-place] + - id: ruff-format + args: [--exit-non-zero-on-format] - repo: https://github.com/codespell-project/codespell rev: v2.4.1 # sync with requirements-lint.txt hooks: diff --git a/3rdparty/composable_kernel b/3rdparty/composable_kernel index 1c45ca35dd5c215e0c1db1f40f01556f467f52a8..b38bb492a1a55b5abb0c345962143c0f9c482cfb 160000 --- a/3rdparty/composable_kernel +++ b/3rdparty/composable_kernel @@ -1 +1 @@ -Subproject commit 1c45ca35dd5c215e0c1db1f40f01556f467f52a8 +Subproject commit b38bb492a1a55b5abb0c345962143c0f9c482cfb diff --git a/3rdparty/tvm b/3rdparty/tvm index 093b2cdb2187140b197336496d65d61ace89e8ff..79ed747db67e60d3a1889d8afd33473bc2424ade 160000 --- a/3rdparty/tvm +++ b/3rdparty/tvm @@ -1 +1 @@ -Subproject commit 093b2cdb2187140b197336496d65d61ace89e8ff +Subproject commit 79ed747db67e60d3a1889d8afd33473bc2424ade diff --git a/CMakeLists.txt b/CMakeLists.txt index 72e1d9795d35e95af35842f439051f720802bccf..7af7f854fd2e8918adc9ffc160760104abc8e61d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -136,14 +136,21 @@ file(GLOB TILE_LANG_SRCS src/*.cc src/layout/*.cc src/transform/*.cc + src/transform/common/*.cc src/op/*.cc src/target/utils.cc + src/target/codegen_c_host.cc src/target/codegen_cpp.cc src/target/rt_mod_cpp.cc # intrin_rule doesn't have system dependency src/target/intrin_rule*.cc ) +# Always include CPU-safe runtime helpers +list(APPEND TILE_LANG_SRCS + src/runtime/error_helpers.cc +) + # Track if the user explicitly selected a backend via cache options. set(TILELANG_BACKEND_USER_SELECTED OFF) foreach(BACKEND IN LISTS TILELANG_BACKENDS) @@ -205,16 +212,28 @@ elseif(USE_CUDA) cmake_path(GET CUDAToolkit_BIN_DIR PARENT_PATH USE_CUDA) file(GLOB TILE_LANG_CUDA_SRCS - src/runtime/*.cc + src/runtime/runtime.cc src/target/ptx.cc src/target/codegen_cuda.cc + src/target/codegen_py.cc + src/target/codegen_utils.cc + src/target/codegen_cutedsl.cc src/target/rt_mod_cuda.cc + src/target/rt_mod_cutedsl.cc ) list(APPEND TILE_LANG_SRCS ${TILE_LANG_CUDA_SRCS}) list(APPEND TILE_LANG_INCLUDES ${CUDAToolkit_INCLUDE_DIRS}) endif() +set(USE_Z3 ON CACHE STRING "Use Z3 SMT solver for TileLang optimizations") +set(USE_PYPI_Z3 ON CACHE BOOL "Use Z3 provided by PyPI z3-solver package") + +if(USE_Z3 AND USE_PYPI_Z3) + list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake/pypi-z3") + find_package(Z3 REQUIRED) +endif() + # Include tvm after configs have been populated add_subdirectory(${TVM_SOURCE} tvm EXCLUDE_FROM_ALL) @@ -222,7 +241,11 @@ add_subdirectory(${TVM_SOURCE} tvm EXCLUDE_FROM_ALL) add_compile_definitions(DMLC_USE_LOGGING_LIBRARY=) add_library(tilelang_objs OBJECT ${TILE_LANG_SRCS}) + +# Set debug mode compile definitions +# We open the deubg option of TVM, i.e. 
TVM_LOG_DEBUG if(CMAKE_BUILD_TYPE STREQUAL "Debug") + message(STATUS "Building TileLang with DEBUG mode") target_compile_definitions(tilelang_objs PRIVATE "TVM_LOG_DEBUG") endif() @@ -232,6 +255,18 @@ add_library(tilelang SHARED $) add_library(tilelang_module SHARED $) target_link_libraries(tilelang PUBLIC tvm_runtime tvm) target_link_libraries(tilelang_module PUBLIC tvm) + +# Place dev build outputs under build/lib for consistency +set_target_properties(tilelang PROPERTIES + LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" + RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" + ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" +) +set_target_properties(tilelang_module PROPERTIES + LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" + RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" + ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" +) # Build cython extension find_package(Python REQUIRED COMPONENTS Interpreter Development.Module ${SKBUILD_SABI_COMPONENT}) @@ -251,25 +286,46 @@ if(NOT "${SKBUILD_SABI_VERSION}" STREQUAL "") endif() python_add_library(tilelang_cython_wrapper MODULE "${CMAKE_BINARY_DIR}/tilelang_cython_wrapper.cpp" ${USE_SABI} WITH_SOABI) -# Install extension into the tilelang package directory + +# Ensure dev builds drop the extension into build/lib alongside other shared libs +set_target_properties(tilelang_cython_wrapper PROPERTIES + LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" + RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" + ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" +) + +# Install the extension into tilelang/lib inside the wheel install(TARGETS tilelang_cython_wrapper - LIBRARY DESTINATION tilelang - RUNTIME DESTINATION tilelang - ARCHIVE DESTINATION tilelang) + LIBRARY DESTINATION tilelang/lib + RUNTIME DESTINATION tilelang/lib + ARCHIVE DESTINATION tilelang/lib) + +# add python z3 lib path to rpath for running tests and dev in current folder +if(USE_Z3 AND USE_PYPI_Z3) + set_property(TARGET tvm APPEND PROPERTY BUILD_RPATH ${Python3_SITELIB}/z3/lib) + set_property(TARGET tvm APPEND PROPERTY BUILD_RPATH ${Python3_SITELIB}/z3/bin) +endif() -# let libtilelang to search tvm/tvm_runtime in same dir if(APPLE) - set_target_properties(tilelang PROPERTIES INSTALL_RPATH "@loader_path;@loader_path/../../tvm_ffi/lib") - set_target_properties(tilelang_module PROPERTIES INSTALL_RPATH "@loader_path;@loader_path/../../tvm_ffi/lib") - set_target_properties(tvm PROPERTIES INSTALL_RPATH "@loader_path;@loader_path/../../tvm_ffi/lib") - set_target_properties(tvm_runtime PROPERTIES INSTALL_RPATH "@loader_path;@loader_path/../../tvm_ffi/lib") + set(TILELANG_INSTALL_RPATH "@loader_path;@loader_path/../../tvm_ffi/lib") + if(USE_Z3 AND USE_PYPI_Z3) + # some z3 is placed in lib/ and some in bin/, we add both in rpath + list(APPEND TILELANG_INSTALL_RPATH "@loader_path/../../z3/lib" "@loader_path/../../z3/bin") + endif() elseif(UNIX) - set_target_properties(tilelang PROPERTIES INSTALL_RPATH "\$ORIGIN:\$ORIGIN/../../tvm_ffi/lib") - set_target_properties(tilelang_module PROPERTIES INSTALL_RPATH "\$ORIGIN:\$ORIGIN/../../tvm_ffi/lib") - set_target_properties(tvm PROPERTIES INSTALL_RPATH "\$ORIGIN:\$ORIGIN/../../tvm_ffi/lib") - set_target_properties(tvm_runtime PROPERTIES INSTALL_RPATH "\$ORIGIN:\$ORIGIN/../../tvm_ffi/lib") + set(TILELANG_INSTALL_RPATH "\$ORIGIN:\$ORIGIN/../../tvm_ffi/lib") + if(USE_Z3 AND USE_PYPI_Z3) + # cmake uses ; by default, we explicitly use : for linux + string(APPEND TILELANG_INSTALL_RPATH ":\$ORIGIN/../../z3/lib") + endif() endif() +# let libtilelang 
to search tvm/tvm_runtime in same dir +set_target_properties(tilelang PROPERTIES INSTALL_RPATH "${TILELANG_INSTALL_RPATH}") +set_target_properties(tilelang_module PROPERTIES INSTALL_RPATH "${TILELANG_INSTALL_RPATH}") +set_target_properties(tvm PROPERTIES INSTALL_RPATH "${TILELANG_INSTALL_RPATH}") +set_target_properties(tvm_runtime PROPERTIES INSTALL_RPATH "${TILELANG_INSTALL_RPATH}") + install( TARGETS tvm tvm_runtime tilelang_module tilelang LIBRARY DESTINATION tilelang/lib diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index e4b45e24bba8eac2bb486ca8e2d4d4c27d93d2e3..7edc20f6a70ec6c8957d17a2ed93de339ff1e528 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -81,6 +81,8 @@ in the main directory. This installation is removable by: python3 -m pip uninstall tilelang ``` +We also recommend installing TileLang in a more manual way for better control over the build process, by compiling the C++ extensions first and set the `PYTHONPATH`. See [Working from Source via `PYTHONPATH`](https://tilelang.com/get_started/Installation.html#working-from-source-via-pythonpath) for detailed instructions. + ## Lint Check To check the linting, run: diff --git a/README.md b/README.md index d7cdabee535a4f1b113a8da4d1d435e027edd175..131c8c047402ddb679848fd00f09565b5539fe17 100644 --- a/README.md +++ b/README.md @@ -13,6 +13,9 @@ Tile Language (**tile-lang**) is a concise domain-specific language designed to ## Latest News +- 12/18/2025 🚀: Added [CuTeDSL backend](https://github.com/tile-ai/tilelang/pull/1421) support, enabling compilation to NVIDIA CUTLASS CuTe DSL! Join us in building and optimizing this exciting new backend: [Issue #1454](https://github.com/tile-ai/tilelang/issues/1454). +- 12/17/2025 🔬: Integrated [Z3 theorem prover](https://github.com/tile-ai/tilelang/pull/1367) into TVM Arith Analyzer, bringing SMT-based symbolic reasoning for enhanced optimizations and automatic correctness verification! +- 10/31/2025 🔧: Migrated to [apache-tvm-ffi](https://github.com/tile-ai/tilelang/pull/1108), significantly reducing CPU overhead! - 10/30/2025 📦: We have released v0.1.6.post2, which is the last version compatible with Python 3.8. - 10/07/2025 🍎: Added Apple Metal Device support, check out [Pull Request #799](https://github.com/tile-ai/tilelang/pull/799) for details. - 09/29/2025 🎉: Thrilled to announce that ​​AscendC​​ and ​Ascend​NPU IR​​ backends targeting Huawei Ascend chips are now supported! @@ -137,7 +140,7 @@ import tilelang.language as T # target currently can be "cuda" or "hip" or "cpu". # if not specified, it will be inferred from the input tensors during compile time @tilelang.jit -def matmul(M, N, K, block_M, block_N, block_K, dtype="float16", accum_dtype="float"): +def matmul(M, N, K, block_M, block_N, block_K, dtype=T.float16, accum_dtype=T.float): @T.prim_func def matmul_relu_kernel( @@ -209,7 +212,7 @@ torch.testing.assert_close(c, ref_c, rtol=1e-2, atol=1e-2) print("Kernel output matches PyTorch reference.") # 4. 
Retrieve and inspect the generated CUDA source (optional) -# cuda_source = jit_kernel.get_kernel_source() +# cuda_source = matmul_relu_kernel.get_kernel_source() # print("Generated CUDA kernel:\n", cuda_source) # 5.Profile latency with kernel diff --git a/VERSION b/VERSION index 5ed6219f48c674c08b93196c366173103fa3076c..11808190d4b90b20fe074a2dad43af6c0c1427ee 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.1.6.post2 +0.1.7 diff --git a/benchmark/blocksparse_attention/benchmark_library_dense_fmha.py b/benchmark/blocksparse_attention/benchmark_library_dense_fmha.py index 6401276ac08c07469d8b25fb60b41580221cf308..3dd82aa5e5218cf37379ed69a2ff93ba1020c199 100644 --- a/benchmark/blocksparse_attention/benchmark_library_dense_fmha.py +++ b/benchmark/blocksparse_attention/benchmark_library_dense_fmha.py @@ -7,10 +7,7 @@ def get_sparse_attn_mask_from_topk(x, topk, use_dense_for_last_block=False): bsz, num_head, downsample_len, _ = x.shape # N_CTX = downsample_len * BLOCK sparse_index = torch.topk(x, topk, dim=-1).indices - dense_mask = torch.full([bsz, num_head, downsample_len, downsample_len], - False, - dtype=torch.bool, - device=x.device) + dense_mask = torch.full([bsz, num_head, downsample_len, downsample_len], False, dtype=torch.bool, device=x.device) dense_mask.scatter_(-1, sparse_index, True) if use_dense_for_last_block: dense_mask[:, :, -2:, :] = True @@ -28,15 +25,15 @@ def get_sparse_attn_mask_from_threshold(x, threshold, use_dense_for_last_block=F def benchmark_topk_sparse_attention(): from benchmark_configs import configs + torch.manual_seed(0) # Config for BATCH, N_HEADS, SEQ_LEN, D_HEAD, TOPK, BLOCK in configs: - # Create inputs - q = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.float16) - k = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.float16) - v = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.float16) + q = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device="cuda", dtype=torch.float16) + k = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device="cuda", dtype=torch.float16) + v = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device="cuda", dtype=torch.float16) import flash_attn diff --git a/benchmark/blocksparse_attention/benchmark_tilelang_block_sparse_fmha.py b/benchmark/blocksparse_attention/benchmark_tilelang_block_sparse_fmha.py index aefe4d4205db2254b3f2649c11f1fe6a42ffa3de..e645ae1475bd7bc89f6b08a5c110bcd459d9e3d5 100644 --- a/benchmark/blocksparse_attention/benchmark_tilelang_block_sparse_fmha.py +++ b/benchmark/blocksparse_attention/benchmark_tilelang_block_sparse_fmha.py @@ -15,10 +15,7 @@ def get_sparse_attn_mask_from_topk(x, topk, use_dense_for_last_block=False): bsz, num_head, downsample_len, _ = x.shape # N_CTX = downsample_len * BLOCK sparse_index = torch.topk(x, topk, dim=-1).indices - dense_mask = torch.full([bsz, num_head, downsample_len, downsample_len], - False, - dtype=torch.bool, - device=x.device) + dense_mask = torch.full([bsz, num_head, downsample_len, downsample_len], False, dtype=torch.bool, device=x.device) dense_mask.scatter_(-1, sparse_index, True) if use_dense_for_last_block: dense_mask[:, :, -2:, :] = True @@ -39,16 +36,15 @@ def blocksparse_flashattn(batch, heads, seq_len, dim, downsample_len, is_causal) block_N = 64 num_stages = 2 threads = 128 - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) shape = [batch, heads, seq_len, dim] block_mask_shape = [batch, heads, downsample_len, downsample_len] - dtype = "float16" - 
accum_dtype = "float" - block_mask_dtype = "bool" + dtype = T.float16 + accum_dtype = T.float32 + block_mask_dtype = T.bool def kernel_func(block_M, block_N, num_stages, threads): - @T.macro def MMA0( K: T.Tensor(shape, dtype), @@ -60,11 +56,10 @@ def blocksparse_flashattn(batch, heads, seq_len, dim, downsample_len, is_causal) by: T.int32, bz: T.int32, ): - T.copy(K[bz, by, k * block_N:(k + 1) * block_N, :], K_shared) + T.copy(K[bz, by, k * block_N : (k + 1) * block_N, :], K_shared) if is_causal: for i, j in T.Parallel(block_M, block_N): - acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0, - -T.infinity(acc_s.dtype)) + acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0, -T.infinity(acc_s.dtype)) else: T.clear(acc_s) T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) @@ -79,22 +74,24 @@ def blocksparse_flashattn(batch, heads, seq_len, dim, downsample_len, is_causal) by: T.int32, bz: T.int32, ): - T.copy(V[bz, by, k * block_N:(k + 1) * block_N, :], V_shared) + T.copy(V[bz, by, k * block_N : (k + 1) * block_N, :], V_shared) T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) @T.macro def Softmax( - acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype), - acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype), - scores_max: T.FragmentBuffer([block_M], accum_dtype), - scores_max_prev: T.FragmentBuffer([block_M], accum_dtype), - scores_scale: T.FragmentBuffer([block_M], accum_dtype), - scores_sum: T.FragmentBuffer([block_M], accum_dtype), - logsum: T.FragmentBuffer([block_M], accum_dtype), + acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype), + acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype), + scores_max: T.FragmentBuffer([block_M], accum_dtype), + scores_max_prev: T.FragmentBuffer([block_M], accum_dtype), + scores_scale: T.FragmentBuffer([block_M], accum_dtype), + scores_sum: T.FragmentBuffer([block_M], accum_dtype), + logsum: T.FragmentBuffer([block_M], accum_dtype), ): T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) # To do causal softmax, we need to set the scores_max to 0 if it is -inf # This process is called Check_inf in FlashAttention3 code, and it only need to be done # in the first ceil_div(kBlockM, kBlockN) steps. 
@@ -114,22 +111,21 @@ def blocksparse_flashattn(batch, heads, seq_len, dim, downsample_len, is_causal) @T.macro def Rescale( - acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), - scores_scale: T.FragmentBuffer([block_M], accum_dtype), + acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), + scores_scale: T.FragmentBuffer([block_M], accum_dtype), ): for i, j in T.Parallel(block_M, dim): acc_o[i, j] *= scores_scale[i] @T.prim_func def main( - Q: T.Tensor(shape, dtype), - K: T.Tensor(shape, dtype), - V: T.Tensor(shape, dtype), - BlockSparseMask: T.Tensor(block_mask_shape, block_mask_dtype), - Output: T.Tensor(shape, dtype), + Q: T.Tensor(shape, dtype), + K: T.Tensor(shape, dtype), + V: T.Tensor(shape, dtype), + BlockSparseMask: T.Tensor(block_mask_shape, block_mask_dtype), + Output: T.Tensor(shape, dtype), ): - with T.Kernel( - T.ceildiv(seq_len, block_M), heads, batch, threads=threads) as (bx, by, bz): + with T.Kernel(T.ceildiv(seq_len, block_M), heads, batch, threads=threads) as (bx, by, bz): Q_shared = T.alloc_shared([block_M, dim], dtype) K_shared = T.alloc_shared([block_N, dim], dtype) V_shared = T.alloc_shared([block_N, dim], dtype) @@ -144,7 +140,7 @@ def blocksparse_flashattn(batch, heads, seq_len, dim, downsample_len, is_causal) logsum = T.alloc_fragment([block_M], accum_dtype) block_mask = T.alloc_local([downsample_len], block_mask_dtype) - T.copy(Q[bz, by, bx * block_M:(bx + 1) * block_M, :], Q_shared) + T.copy(Q[bz, by, bx * block_M : (bx + 1) * block_M, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) @@ -153,20 +149,19 @@ def blocksparse_flashattn(batch, heads, seq_len, dim, downsample_len, is_causal) block_mask[vj] = BlockSparseMask[bz, by, bx, vj] loop_range = ( - T.min(T.ceildiv(seq_len, block_N), T.ceildiv( - (bx + 1) * block_M, block_N)) if is_causal else T.ceildiv(seq_len, block_N)) + T.min(T.ceildiv(seq_len, block_N), T.ceildiv((bx + 1) * block_M, block_N)) if is_causal else T.ceildiv(seq_len, block_N) + ) for k in T.Pipelined(loop_range, num_stages=num_stages): if block_mask[k]: MMA0(K, Q_shared, K_shared, acc_s, k, bx, by, bz) - Softmax(acc_s, acc_s_cast, scores_max, scores_max_prev, scores_scale, - scores_sum, logsum) + Softmax(acc_s, acc_s_cast, scores_max, scores_max_prev, scores_scale, scores_sum, logsum) Rescale(acc_o, scores_scale) MMA1(V, V_shared, acc_s_cast, acc_o, k, by, bz) for i, j in T.Parallel(block_M, dim): acc_o[i, j] /= logsum[i] T.copy(acc_o, O_shared) - T.copy(O_shared, Output[bz, by, bx * block_M:(bx + 1) * block_M, :]) + T.copy(O_shared, Output[bz, by, bx * block_M : (bx + 1) * block_M, :]) return main @@ -175,26 +170,23 @@ def blocksparse_flashattn(batch, heads, seq_len, dim, downsample_len, is_causal) def benchmark_topk_sparse_attention(): from benchmark_configs import configs + torch.manual_seed(0) # Config for BATCH, N_HEADS, SEQ_LEN, D_HEAD, TOPK, BLOCK in configs: - # Create inputs - q = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.float16) - k = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.float16) - v = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.float16) + q = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device="cuda", dtype=torch.float16) + k = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device="cuda", dtype=torch.float16) + v = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device="cuda", dtype=torch.float16) # Create sparse mask (downsampled to block level) downsample_factor = BLOCK downsample_len = 
math.ceil(SEQ_LEN / downsample_factor) - x_ds = torch.randn([BATCH, N_HEADS, downsample_len, downsample_len], - device='cuda', - dtype=torch.bfloat16) + x_ds = torch.randn([BATCH, N_HEADS, downsample_len, downsample_len], device="cuda", dtype=torch.bfloat16) x_ds[:, :, :, 0] = 100 block_mask = get_sparse_attn_mask_from_topk(x_ds, topk=TOPK) - program = blocksparse_flashattn( - BATCH, N_HEADS, SEQ_LEN, D_HEAD, downsample_len, is_causal=True) + program = blocksparse_flashattn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, downsample_len, is_causal=True) kernel = tilelang.compile(program, out_idx=4) def benchmark_fn(): diff --git a/benchmark/blocksparse_attention/benchmark_torch_block_sparse_fmha.py b/benchmark/blocksparse_attention/benchmark_torch_block_sparse_fmha.py index e4828ce5f6a44f9ed86ce4d9a9446d047bee8814..85d754ae3a77f679a9d714eb1c2f83573a47c911 100644 --- a/benchmark/blocksparse_attention/benchmark_torch_block_sparse_fmha.py +++ b/benchmark/blocksparse_attention/benchmark_torch_block_sparse_fmha.py @@ -10,10 +10,7 @@ def get_sparse_attn_mask_from_topk(x, topk, use_dense_for_last_block=False): bsz, num_head, downsample_len, _ = x.shape # N_CTX = downsample_len * BLOCK sparse_index = torch.topk(x, topk, dim=-1).indices - dense_mask = torch.full([bsz, num_head, downsample_len, downsample_len], - False, - dtype=torch.bool, - device=x.device) + dense_mask = torch.full([bsz, num_head, downsample_len, downsample_len], False, dtype=torch.bool, device=x.device) dense_mask.scatter_(-1, sparse_index, True) if use_dense_for_last_block: dense_mask[:, :, -2:, :] = True @@ -31,39 +28,37 @@ def get_sparse_attn_mask_from_threshold(x, threshold, use_dense_for_last_block=F def benchmark_topk_sparse_attention(): from benchmark_configs import configs + torch.manual_seed(0) # Config for BATCH, N_HEADS, SEQ_LEN, D_HEAD, TOPK, BLOCK in configs: - # Create inputs - q = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.float16) - k = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.float16) - v = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.float16) + q = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device="cuda", dtype=torch.float16) + k = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device="cuda", dtype=torch.float16) + v = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device="cuda", dtype=torch.float16) sm_scale = 1.0 / (D_HEAD**0.5) # Create sparse mask (downsampled to block level) downsample_factor = BLOCK downsample_len = math.ceil(SEQ_LEN / downsample_factor) - x_ds = torch.randn([BATCH, N_HEADS, downsample_len, downsample_len], - device='cuda', - dtype=torch.bfloat16) + x_ds = torch.randn([BATCH, N_HEADS, downsample_len, downsample_len], device="cuda", dtype=torch.bfloat16) x_ds[:, :, :, 0] = 100 block_mask = get_sparse_attn_mask_from_topk(x_ds, topk=TOPK) def benchmark_fn(): # Compute reference # Expand block mask to full attention matrix - full_mask = torch.kron(block_mask.float(), torch.ones(BLOCK, BLOCK, device='cuda')) + full_mask = torch.kron(block_mask.float(), torch.ones(BLOCK, BLOCK, device="cuda")) full_mask = full_mask[..., :SEQ_LEN, :SEQ_LEN].bool() full_mask = full_mask & torch.tril(torch.ones_like(full_mask)) # Apply causal # PyTorch reference implementation - attn = torch.einsum('bhsd,bhtd->bhst', q, k) * sm_scale - attn = attn.masked_fill(~full_mask, float('-inf')) + attn = torch.einsum("bhsd,bhtd->bhst", q, k) * sm_scale + attn = attn.masked_fill(~full_mask, float("-inf")) attn = F.softmax(attn, dim=-1) - ref_output = 
torch.einsum('bhst,bhtd->bhsd', attn, v) + ref_output = torch.einsum("bhst,bhtd->bhsd", attn, v) return ref_output ref_latency = do_bench( diff --git a/benchmark/blocksparse_attention/benchmark_triton_block_sparse_fmha.py b/benchmark/blocksparse_attention/benchmark_triton_block_sparse_fmha.py index 86ac894bc7197669c77f95bf96a70f173166b051..7ebca93a6a3735ac0fae1cdcd706587d3521e5b6 100644 --- a/benchmark/blocksparse_attention/benchmark_triton_block_sparse_fmha.py +++ b/benchmark/blocksparse_attention/benchmark_triton_block_sparse_fmha.py @@ -15,10 +15,7 @@ def get_sparse_attn_mask_from_topk(x, topk, use_dense_for_last_block=False): bsz, num_head, downsample_len, _ = x.shape # N_CTX = downsample_len * BLOCK sparse_index = torch.topk(x, topk, dim=-1).indices - dense_mask = torch.full([bsz, num_head, downsample_len, downsample_len], - False, - dtype=torch.bool, - device=x.device) + dense_mask = torch.full([bsz, num_head, downsample_len, downsample_len], False, dtype=torch.bool, device=x.device) dense_mask.scatter_(-1, sparse_index, True) if use_dense_for_last_block: dense_mask[:, :, -2:, :] = True @@ -56,7 +53,6 @@ def _fwd_kernel_inner( BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, ): - mask_val = tl.load(block_mask_ptr + k_block_col_idx * stride_bmask_n) if mask_val == True: @@ -72,8 +68,7 @@ def _fwd_kernel_inner( # the following is needed only when LAST_K_BLOCK or BLOCK_M < BLOCK_N if LAST_K_BLOCK: - qk += tl.where(offs_m[:, None] + past_len >= (start_n + offs_n[None, :]), 0, - float('-inf')) + qk += tl.where(offs_m[:, None] + past_len >= (start_n + offs_n[None, :]), 0, float("-inf")) m_ij = tl.maximum(m_i, tl.max(qk, 1)) qk -= m_ij[:, None] @@ -153,7 +148,7 @@ def _fwd_kernel( v_ptrs = V + off_v mask_ptrs = block_mask_ptr + start_m * stride_bmm - m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float('inf') + m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf") l_i = tl.zeros([BLOCK_M], dtype=tl.float32) acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32) @@ -191,24 +186,12 @@ def _fwd_kernel( acc = acc * l_recip acc = acc.to(Out.dtype.element_ty) - off_o = off_z * stride_oz + off_h * stride_oh + offs_m[:, None] * stride_om + offs_d[ - None, :] * stride_od + off_o = off_z * stride_oz + off_h * stride_oh + offs_m[:, None] * stride_om + offs_d[None, :] * stride_od out_ptrs = Out + off_o tl.store(out_ptrs, acc, mask=offs_m[:, None] < N_CTX) -def _forward(ctx, - q, - k, - v, - block_sparse_mask, - sm_scale, - BLOCK_M=64, - BLOCK_N=64, - num_warps=None, - num_stages=1, - out=None): - +def _forward(ctx, q, k, v, block_sparse_mask, sm_scale, BLOCK_M=64, BLOCK_N=64, num_warps=None, num_stages=1, out=None): assert q.shape[-1] == k.shape[-1] == v.shape[-1] assert k.shape[2] == v.shape[2] o = out if out is not None else torch.empty_like(q).contiguous() @@ -253,7 +236,6 @@ def _forward(ctx, class _sparse_attention(torch.autograd.Function): - @staticmethod def forward(ctx, q, k, v, block_sparse_dense, sm_scale): # shape constraints @@ -271,24 +253,22 @@ block_sparse_triton_fn = _sparse_attention.apply def benchmark_topk_sparse_attention(): from benchmark_configs import configs + torch.manual_seed(0) # Config for BATCH, N_HEADS, SEQ_LEN, D_HEAD, TOPK, BLOCK in configs: - # Create inputs - q = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.float16) - k = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.float16) - v = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.float16) + q = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, 
device="cuda", dtype=torch.float16) + k = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device="cuda", dtype=torch.float16) + v = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device="cuda", dtype=torch.float16) sm_scale = 1.0 / (D_HEAD**0.5) # Create sparse mask (downsampled to block level) downsample_factor = BLOCK downsample_len = math.ceil(SEQ_LEN / downsample_factor) - x_ds = torch.randn([BATCH, N_HEADS, downsample_len, downsample_len], - device='cuda', - dtype=torch.bfloat16) + x_ds = torch.randn([BATCH, N_HEADS, downsample_len, downsample_len], device="cuda", dtype=torch.bfloat16) x_ds[:, :, :, 0] = 100 block_mask = get_sparse_attn_mask_from_topk(x_ds, topk=TOPK) diff --git a/benchmark/mamba2/benchmark_mamba_chunk_scan.py b/benchmark/mamba2/benchmark_mamba_chunk_scan.py index aff810f660ceeb0716bf07300c3b3801ecbaea9e..c9f5cec670a40aad2e62d78847613277c8827648 100644 --- a/benchmark/mamba2/benchmark_mamba_chunk_scan.py +++ b/benchmark/mamba2/benchmark_mamba_chunk_scan.py @@ -51,14 +51,15 @@ def ref_program(cb, x, dt, dA_cumsum, C, prev_states, D): dt_segment_sum = dA_cumsum[:, :, :, :, None] - dA_cumsum[:, :, :, None, :] decay = torch.exp(dt_segment_sum) scores_decay = cb * rearrange(decay, "b h c l s -> b c h l s") - causal_mask = torch.tril( - torch.ones(chunk_size, chunk_size, device=x.device, dtype=bool), diagonal=0) + causal_mask = torch.tril(torch.ones(chunk_size, chunk_size, device=x.device, dtype=bool), diagonal=0) scores_decay = scores_decay.masked_fill(~causal_mask, 0) - out = torch.einsum('bchls,bhcs,bcshp->bclhp', scores_decay.to(x.dtype), dt.to(x.dtype), - rearrange(x, "b (c s) h p -> b c s h p", c=nchunks)) + out = torch.einsum( + "bchls,bhcs,bcshp->bclhp", scores_decay.to(x.dtype), dt.to(x.dtype), rearrange(x, "b (c s) h p -> b c s h p", c=nchunks) + ) state_decay_out = torch.exp(rearrange(dA_cumsum, "b h c l -> b c l h 1")) - out_prev = torch.einsum('bclhn,bchpn->bclhp', rearrange( - C, "b (c l) h n -> b c l h n", c=nchunks), prev_states.to(C.dtype)) * state_decay_out + out_prev = ( + torch.einsum("bclhn,bchpn->bclhp", rearrange(C, "b (c l) h n -> b c l h n", c=nchunks), prev_states.to(C.dtype)) * state_decay_out + ) out = out + out_prev out = rearrange(out, "b c l h p -> b (c l) h p") if D is not None: @@ -74,7 +75,6 @@ def chunk_scan_triton(cb, x, dt, dA_cumsum, C, states, D): def chunk_scan_helion(cb, x, dt, dA_cumsum, C, states, D): - @helion.kernel() def helion_mamba2_chunk_scan_kernel( cb: torch.Tensor, @@ -118,8 +118,7 @@ def chunk_scan_helion(cb, x, dt, dA_cumsum, C, states, D): dtype = cb.dtype accum_dtype = torch.float32 - assert (x.dtype == dt.dtype == dA_cumsum.dtype == C.dtype == prev_states.dtype == D.dtype == - dtype) + assert x.dtype == dt.dtype == dA_cumsum.dtype == C.dtype == prev_states.dtype == D.dtype == dtype out = torch.empty_like(x) @@ -127,11 +126,10 @@ def chunk_scan_helion(cb, x, dt, dA_cumsum, C, states, D): for tile_h, tile_m, tile_n, tile_b, tile_c in hl.tile( [nheads, chunk_size, headdim, batch, nchunks], - block_size=[1, block_m, block_n, 1, 1], + block_size=[1, block_m, block_n, 1, 1], ): acc_o = hl.zeros([tile_m, tile_n], dtype=accum_dtype) - dA_cumsum_local_m = dA_cumsum[tile_b.begin, tile_h.begin, tile_c.begin, - tile_m].to(torch.float32) + dA_cumsum_local_m = dA_cumsum[tile_b.begin, tile_h.begin, tile_c.begin, tile_m].to(torch.float32) scale_m_local = torch.exp2(dA_cumsum_local_m * p) C_local = C[ @@ -152,10 +150,8 @@ def chunk_scan_helion(cb, x, dt, dA_cumsum, C, states, D): tile_m, tile_k, ] - dA_cumsum_local_k = 
dA_cumsum[tile_b.begin, tile_h.begin, tile_c.begin, - tile_k].to(torch.float32) - cb_local *= torch.exp2(dA_cumsum_local_m[:, None] * p - - dA_cumsum_local_k[None, :] * p) + dA_cumsum_local_k = dA_cumsum[tile_b.begin, tile_h.begin, tile_c.begin, tile_k].to(torch.float32) + cb_local *= torch.exp2(dA_cumsum_local_m[:, None] * p - dA_cumsum_local_k[None, :] * p) dt_local = dt[tile_b.begin, tile_h.begin, tile_c.begin, tile_k].to(torch.float32) cb_local = (cb_local * dt_local[None, :]).to(dtype) pred = (tile_m.index + 0)[:, None] >= (tile_k.index + 0)[None, :] @@ -169,11 +165,9 @@ def chunk_scan_helion(cb, x, dt, dA_cumsum, C, states, D): acc_o = hl.dot(cb_local, x_local, acc=acc_o) D_local = D[tile_h.begin].to(torch.float32) - x_residual = x[tile_b.begin, tile_c.begin * chunk_size + tile_m.index, tile_h.begin, - tile_n].to(torch.float32) + x_residual = x[tile_b.begin, tile_c.begin * chunk_size + tile_m.index, tile_h.begin, tile_n].to(torch.float32) acc_o += x_residual * D_local - out[tile_b.begin, tile_c.begin * chunk_size + tile_m.index, tile_h.begin, - tile_n] = acc_o.to(dtype=dtype) + out[tile_b.begin, tile_c.begin * chunk_size + tile_m.index, tile_h.begin, tile_n] = acc_o.to(dtype=dtype) return out @@ -182,12 +176,7 @@ def chunk_scan_helion(cb, x, dt, dA_cumsum, C, states, D): def get_configs(): - iter_params = dict( - block_M=[64, 128, 256], - block_N=[32, 64], - block_K=[64, 128, 256], - block_Dstate=[128], - num_stages=[1, 2, 3, 4, 5]) + iter_params = dict(block_M=[64, 128, 256], block_N=[32, 64], block_K=[64, 128, 256], block_Dstate=[128], num_stages=[1, 2, 3, 4, 5]) return [dict(zip(iter_params, values)) for values in itertools.product(*iter_params.values())] @@ -198,40 +187,42 @@ def get_configs(): tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, }, ) -def chunk_scan_fwd(batch, - seqlen, - chunk_size, - ngroups, - nheads, - headdim, - dstate, - block_M=64, - block_N=64, - block_K=64, - block_Dstate=128, - num_stages=2, - threads=128): - dtype = "float16" - accum_dtype = "float" +def chunk_scan_fwd( + batch, + seqlen, + chunk_size, + ngroups, + nheads, + headdim, + dstate, + block_M=64, + block_N=64, + block_K=64, + block_Dstate=128, + num_stages=2, + threads=128, +): + dtype = T.float16 + accum_dtype = T.float32 nchunks = T.ceildiv(seqlen, chunk_size) p = 1.44269504 @T.prim_func def main( - cb: T.Tensor((batch, nchunks, ngroups, chunk_size, chunk_size), dtype), # type: ignore - x: T.Tensor((batch, seqlen, nheads, headdim), dtype), # type: ignore - dt: T.Tensor((batch, nheads, nchunks, chunk_size), dtype), # type: ignore - dA_cumsum: T.Tensor((batch, nheads, nchunks, chunk_size), dtype), # type: ignore - C: T.Tensor((batch, seqlen, ngroups, dstate), dtype), # type: ignore - prev_states: T.Tensor((batch, nchunks, nheads, headdim, dstate), dtype), # type: ignore - D: T.Tensor((nheads), dtype), # type: ignore - Output: T.Tensor((batch, seqlen, nheads, headdim), dtype) # type: ignore + cb: T.Tensor((batch, nchunks, ngroups, chunk_size, chunk_size), dtype), # type: ignore + x: T.Tensor((batch, seqlen, nheads, headdim), dtype), # type: ignore + dt: T.Tensor((batch, nheads, nchunks, chunk_size), dtype), # type: ignore + dA_cumsum: T.Tensor((batch, nheads, nchunks, chunk_size), dtype), # type: ignore + C: T.Tensor((batch, seqlen, ngroups, dstate), dtype), # type: ignore + prev_states: T.Tensor((batch, nchunks, nheads, headdim, dstate), dtype), # type: ignore + D: T.Tensor((nheads), dtype), # type: ignore + Output: T.Tensor((batch, seqlen, nheads, headdim), dtype), # type: ignore ): - with 
T.Kernel( - nheads, - T.ceildiv(chunk_size, block_M) * T.ceildiv(headdim, block_N), - batch * nchunks, - threads=threads) as (bz, bx, by): + with T.Kernel(nheads, T.ceildiv(chunk_size, block_M) * T.ceildiv(headdim, block_N), batch * nchunks, threads=threads) as ( + bz, + bx, + by, + ): acc_o = T.alloc_fragment((block_M, block_N), accum_dtype) acc_o_shared = T.alloc_shared((block_M, block_N), dtype) cb_shared = T.alloc_shared((block_M, block_K), dtype, scope="shared.dyn") @@ -257,27 +248,32 @@ def chunk_scan_fwd(batch, m_idx = bx // T.ceildiv(headdim, block_N) n_idx = bx % T.ceildiv(headdim, block_N) - T.annotate_layout({ - acc_o_shared: tilelang.layout.make_swizzled_layout(acc_o_shared), - cb_shared: tilelang.layout.make_swizzled_layout(cb_shared), - x_residual_shared: tilelang.layout.make_swizzled_layout(x_residual_shared) - }) + T.annotate_layout( + { + acc_o_shared: tilelang.layout.make_swizzled_layout(acc_o_shared), + cb_shared: tilelang.layout.make_swizzled_layout(cb_shared), + x_residual_shared: tilelang.layout.make_swizzled_layout(x_residual_shared), + } + ) T.no_set_max_nreg() - T.copy(dA_cumsum[batch_idx, bz, chunk_idx, m_idx * block_M:(m_idx + 1) * block_M], - dA_cs_m_shared) + T.copy(dA_cumsum[batch_idx, bz, chunk_idx, m_idx * block_M : (m_idx + 1) * block_M], dA_cs_m_shared) T.copy(dA_cs_m_shared, dA_cs_m_local) T.clear(acc_o) for i in T.Parallel(block_M): scale_m_local[i] = T.exp2(dA_cs_m_local[i] * p) T.copy( - C[batch_idx, chunk_idx * chunk_size + m_idx * block_M:chunk_idx * chunk_size + - (m_idx + 1) * block_M, bz // (nheads // ngroups), 0:block_Dstate], C_shared) - T.copy( - prev_states[batch_idx, chunk_idx, bz, n_idx * block_N:(n_idx + 1) * block_N, - 0:block_Dstate], prev_state_shared) + C[ + batch_idx, + chunk_idx * chunk_size + m_idx * block_M : chunk_idx * chunk_size + (m_idx + 1) * block_M, + bz // (nheads // ngroups), + 0:block_Dstate, + ], + C_shared, + ) + T.copy(prev_states[batch_idx, chunk_idx, bz, n_idx * block_N : (n_idx + 1) * block_N, 0:block_Dstate], prev_state_shared) T.gemm(C_shared, prev_state_shared, acc_o, transpose_B=True) for i, j in T.Parallel(block_M, block_N): acc_o[i, j] *= scale_m_local[i] @@ -286,34 +282,47 @@ def chunk_scan_fwd(batch, for k in T.Pipelined(loop_range, num_stages=num_stages): T.copy( - cb[batch_idx, chunk_idx, bz // (nheads // ngroups), - m_idx * block_M:(m_idx + 1) * block_M, k * block_K:(k + 1) * block_K], - cb_shared) + cb[ + batch_idx, + chunk_idx, + bz // (nheads // ngroups), + m_idx * block_M : (m_idx + 1) * block_M, + k * block_K : (k + 1) * block_K, + ], + cb_shared, + ) T.copy(cb_shared, cb_local) - T.copy(dA_cumsum[batch_idx, bz, chunk_idx, k * block_K:(k + 1) * block_K], - dA_cs_k_shared) + T.copy(dA_cumsum[batch_idx, bz, chunk_idx, k * block_K : (k + 1) * block_K], dA_cs_k_shared) T.copy(dA_cs_k_shared, dA_cs_k_local) for i, j in T.Parallel(block_M, block_K): - cb_local[i, - j] = cb_local[i, - j] * T.exp2(dA_cs_m_local[i] * p - dA_cs_k_local[j] * p) - T.copy(dt[batch_idx, bz, chunk_idx, k * block_K:(k + 1) * block_K], dt_shared) + cb_local[i, j] = cb_local[i, j] * T.exp2(dA_cs_m_local[i] * p - dA_cs_k_local[j] * p) + T.copy(dt[batch_idx, bz, chunk_idx, k * block_K : (k + 1) * block_K], dt_shared) T.copy(dt_shared, dt_local) for i, j in T.Parallel(block_M, block_K): cb_local[i, j] *= dt_local[j] for i, j in T.Parallel(block_M, block_K): - cb_local[i, j] = T.if_then_else(m_idx * block_M + i >= k * block_K + j, - cb_local[i, j], 0) + cb_local[i, j] = T.if_then_else(m_idx * block_M + i >= k * block_K + j, cb_local[i, j], 
0) T.copy( - x[batch_idx, chunk_idx * chunk_size + k * block_K:chunk_idx * chunk_size + - (k + 1) * block_K, bz, n_idx * block_N:(n_idx + 1) * block_N], x_shared) + x[ + batch_idx, + chunk_idx * chunk_size + k * block_K : chunk_idx * chunk_size + (k + 1) * block_K, + bz, + n_idx * block_N : (n_idx + 1) * block_N, + ], + x_shared, + ) T.gemm(cb_local, x_shared, acc_o) D_local[0] = D[bz] T.copy( - x[batch_idx, chunk_idx * chunk_size + m_idx * block_M:chunk_idx * chunk_size + - (m_idx + 1) * block_M, bz, n_idx * block_N:(n_idx + 1) * block_N], - x_residual_shared) + x[ + batch_idx, + chunk_idx * chunk_size + m_idx * block_M : chunk_idx * chunk_size + (m_idx + 1) * block_M, + bz, + n_idx * block_N : (n_idx + 1) * block_N, + ], + x_residual_shared, + ) T.copy(x_residual_shared, x_residual_local) for i, j in T.Parallel(block_M, block_N): acc_o[i, j] += x_residual_local[i, j] * D_local[0] @@ -321,24 +330,37 @@ def chunk_scan_fwd(batch, T.copy(acc_o, acc_o_shared) T.copy( acc_o_shared, - Output[batch_idx, chunk_idx * chunk_size + m_idx * block_M:chunk_idx * chunk_size + - (m_idx + 1) * block_M, bz, n_idx * block_N:(n_idx + 1) * block_N]) + Output[ + batch_idx, + chunk_idx * chunk_size + m_idx * block_M : chunk_idx * chunk_size + (m_idx + 1) * block_M, + bz, + n_idx * block_N : (n_idx + 1) * block_N, + ], + ) return main if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=8, help='batch size') - parser.add_argument('--heads', type=int, default=80, help='heads') - parser.add_argument('--groups', type=int, default=1, help='groups') - parser.add_argument('--seq_len', type=int, default=4096, help='sequence length') - parser.add_argument('--chunk_size', type=int, default=256, help='chunk size') - parser.add_argument('--dim', type=int, default=64, help='dim') - parser.add_argument('--dstate', type=int, default=128, help='dstate') - parser.add_argument('--tune', action='store_true', help='tune configs') + parser.add_argument("--batch", type=int, default=8, help="batch size") + parser.add_argument("--heads", type=int, default=80, help="heads") + parser.add_argument("--groups", type=int, default=1, help="groups") + parser.add_argument("--seq_len", type=int, default=4096, help="sequence length") + parser.add_argument("--chunk_size", type=int, default=256, help="chunk size") + parser.add_argument("--dim", type=int, default=64, help="dim") + parser.add_argument("--dstate", type=int, default=128, help="dstate") + parser.add_argument("--tune", action="store_true", help="tune configs") args = parser.parse_args() - batch, heads, groups, seq_len, chunk_size, dim, dstate = args.batch, args.heads, args.groups, args.seq_len, args.chunk_size, args.dim, args.dstate + batch, heads, groups, seq_len, chunk_size, dim, dstate = ( + args.batch, + args.heads, + args.groups, + args.seq_len, + args.chunk_size, + args.dim, + args.dstate, + ) nchunks = math.ceil(seq_len / chunk_size) total_flops = 2 * batch * seq_len * chunk_size * heads * dim * 0.5 + 2 * batch * seq_len * heads * dim * dstate @@ -360,8 +382,7 @@ if __name__ == "__main__": D = torch.randn(heads).half().cuda() print("Benchmarking Triton...") - triton_latency = do_bench( - lambda: chunk_scan_triton(cb, x, dt, dA_cumsum, C, states, D), _n_warmup=10, _n_repeat=10) + triton_latency = do_bench(lambda: chunk_scan_triton(cb, x, dt, dA_cumsum, C, states, D), _n_warmup=10, _n_repeat=10) print(f"Triton TFlops: {total_flops / triton_latency * 1e-9}") print("Benchmarking Helion...") diff --git 
a/benchmark/matmul/benchmark_matmul.py b/benchmark/matmul/benchmark_matmul.py index c64f4fabf8e1bfb3c5c1566cd66359710c70d552..643c1fd5e9661634a48068f6a313433ea257a179 100644 --- a/benchmark/matmul/benchmark_matmul.py +++ b/benchmark/matmul/benchmark_matmul.py @@ -6,6 +6,7 @@ import tilelang import tilelang.language as T from tilelang.autotuner import autotune from tilelang import jit + # Configure logger logger = logging.getLogger(__name__) logger.setLevel(logging.DEBUG) @@ -61,9 +62,9 @@ def get_configs(args, kwargs): M=M, N=N, K=K, - in_dtype="float16", - out_dtype="float16", - accum_dtype="float", + in_dtype=T.float16, + out_dtype=T.float16, + accum_dtype=T.float32, ).with_arch(arch) func = carve_template.equivalent_function() @@ -101,9 +102,7 @@ def get_configs(args, kwargs): policy=[T.GemmWarpPolicy.Square], enable_rasteration=[True, False], ) - return [{ - k: v for k, v in zip(iter_params, values) - } for values in itertools.product(*iter_params.values())] + return [{k: v for k, v in zip(iter_params, values)} for values in itertools.product(*iter_params.values())] return configs @@ -112,7 +111,9 @@ def get_configs(args, kwargs): warmup=3, rep=20, ) -@jit(out_idx=[2],) +@jit( + out_idx=[2], +) def matmul( M, N, @@ -154,14 +155,14 @@ def matmul( # Use half-precision for input data to reduce memory bandwidth, # accumulate in float for better numerical accuracy - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 @T.prim_func def main( - A: T.Tensor((M, K), dtype), - B: T.Tensor((N, K), dtype), - C: T.Tensor((M, N), dtype), + A: T.Tensor((M, K), dtype), + B: T.Tensor((N, K), dtype), + C: T.Tensor((M, N), dtype), ): """ The compiled TVM function for block-level matrix multiplication. @@ -176,7 +177,6 @@ def matmul( # Bind x-dimension to block index in N, # y-dimension to block index in M. 
with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=thread_num) as (bx, by): - # Allocate shared memory for A sub-block of shape (block_M, block_K) A_shared = T.alloc_shared((block_M, block_K), dtype) # Allocate shared memory for B sub-block of shape (block_N, block_K) diff --git a/benchmark/matmul/benchmark_matmul_intrinsic.py b/benchmark/matmul/benchmark_matmul_intrinsic.py index 94e36b385baf111709be26bdc4b379af10f5fceb..4ef860c21039b2b3288a43f5d09bae9ed30e7a36 100644 --- a/benchmark/matmul/benchmark_matmul_intrinsic.py +++ b/benchmark/matmul/benchmark_matmul_intrinsic.py @@ -6,7 +6,8 @@ import tilelang as tl import tilelang.language as T from tilelang.intrinsics import get_swizzle_layout from tilelang.intrinsics.mma_macro_generator import ( - TensorCoreIntrinEmitter,) + TensorCoreIntrinEmitter, +) from tilelang.transform import simplify_prim_func from tilelang.autotuner import autotune import itertools @@ -48,22 +49,22 @@ def tl_matmul( enable_rasteration=False, ): assert in_dtype in [ - "float16", - "int8", + T.float16, + T.int8, ], "Currently only float16 and int8 are supported" assert out_dtype in [ - "float16", - "float32", - "int32", + T.float16, + T.float32, + T.int32, ], "Currently only float16, float32 and int32 are supported" micro_size_x = micro_size_y = micro_size_k = 16 - if out_dtype == "int32": + if out_dtype == T.int32: micro_size_k = 32 # This is a debug config - # chunk = 32 if in_dtype == "float16" else 64 + # chunk = 32 if in_dtype == T.float16 else 64 shared_scope = "shared.dyn" block_M = block_row_warps * warp_row_tiles @@ -103,12 +104,11 @@ def tl_matmul( @T.prim_func def main( - A: T.Tensor(A_shape, in_dtype), - B: T.Tensor(B_shape, in_dtype), - C: T.Tensor((M, N), out_dtype), + A: T.Tensor(A_shape, in_dtype), + B: T.Tensor(B_shape, in_dtype), + C: T.Tensor((M, N), out_dtype), ): with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): - A_shared = T.alloc_shared(A_shared_shape, in_dtype, scope=shared_scope) B_shared = T.alloc_shared(B_shared_shape, in_dtype, scope=shared_scope) C_shared = T.alloc_shared(C_shared_shape, out_dtype, scope=shared_scope) @@ -116,10 +116,12 @@ def tl_matmul( B_local = T.alloc_local((warp_cols * local_size_b), in_dtype) C_local = T.alloc_local((warp_rows * warp_cols * local_size_c), accum_dtype) - T.annotate_layout({ - A_shared: make_swizzle_layout(A_shared), - B_shared: make_swizzle_layout(B_shared), - }) + T.annotate_layout( + { + A_shared: make_swizzle_layout(A_shared), + B_shared: make_swizzle_layout(B_shared), + } + ) # Improve L2 Cache T.use_swizzle(panel_size=10, enable=enable_rasteration) @@ -127,7 +129,6 @@ def tl_matmul( T.clear(C_local) for ko in T.Pipelined((K // block_K), num_stages=stage): - # Load A into shared memory for i, k in T.Parallel(block_M, block_K): A_shared[i, k] = A[by * block_M + i, ko * block_K + k] @@ -137,7 +138,6 @@ def tl_matmul( B_shared[j, k] = B[bx * block_N + j, ko * block_K + k] for ki in T.serial(0, (block_K // micro_size_k)): - # Load A into fragment mma_emitter.ldmatrix_a(A_local, A_shared, ki) @@ -194,9 +194,9 @@ def get_configs(args, kwargs): M=M, N=N, K=K, - in_dtype="float16", - out_dtype="float16", - accum_dtype="float16", + in_dtype=T.float16, + out_dtype=T.float16, + accum_dtype=T.float16, ).with_arch(arch) func = carve_template.equivalent_function() @@ -223,7 +223,6 @@ def get_configs(args, kwargs): for config in configs: print(config) else: - iter_params = dict( block_row_warps=[1, 2, 4], block_col_warps=[1, 2, 4], @@ -233,9 +232,7 @@ def 
get_configs(args, kwargs): stage=[0, 2], enable_rasteration=[True, False], ) - return [{ - k: v for k, v in zip(iter_params, values) - } for values in itertools.product(*iter_params.values())] + return [{k: v for k, v in zip(iter_params, values)} for values in itertools.product(*iter_params.values())] return configs @@ -247,14 +244,16 @@ def get_configs(args, kwargs): ref_prog=ref_program, skip_check=True, ) -@tl.jit(out_idx=[2],) +@tl.jit( + out_idx=[2], +) def matmul( M, N, K, - in_dtype="float16", - out_dtype="float16", - accum_dtype="float16", + in_dtype=T.float16, + out_dtype=T.float16, + accum_dtype=T.float16, with_roller=False, block_row_warps=None, block_col_warps=None, @@ -291,19 +290,14 @@ if __name__ == "__main__": parser.add_argument("--m", type=int, default=16384, help="Matrix dimension M") parser.add_argument("--n", type=int, default=16384, help="Matrix dimension N") parser.add_argument("--k", type=int, default=16384, help="Matrix dimension K") - parser.add_argument( - "--with_roller", - type=bool, - default=False, - help="Whether to use roller to deduce search spaces") - parser.add_argument( - "--dtype", type=str, default="float16", choices=["float16", "int8"], help="Input data type") + parser.add_argument("--with_roller", type=bool, default=False, help="Whether to use roller to deduce search spaces") + parser.add_argument("--dtype", type=str, default="float16", choices=["float16", "int8"], help="Input data type") args = parser.parse_args() M, N, K = args.m, args.n, args.k - in_dtype = args.dtype - out_dtype = "float32" if in_dtype == "int8" else "float16" - accum_dtype = "float32" if in_dtype == "int8" else "float16" + in_dtype = T.dtype(args.dtype) + out_dtype = T.float32 if in_dtype == T.int8 else T.float16 + accum_dtype = T.float32 if in_dtype == T.int8 else T.float16 with_roller = args.with_roller with_roller = True # Compute total floating-point operations diff --git a/benchmark/matmul/benchmark_matmul_sp.py b/benchmark/matmul/benchmark_matmul_sp.py index 4e4ed61283d234b6433e13bda0ae7222a456c641..7ecffc26a28b9d171e70fcd1b25395708310be2e 100644 --- a/benchmark/matmul/benchmark_matmul_sp.py +++ b/benchmark/matmul/benchmark_matmul_sp.py @@ -9,7 +9,7 @@ import tilelang.language as T from tilelang.autotuner import autotune from tilelang import jit from tilelang.contrib import nvcc -from tilelang.layout import make_metadata_layout +from tilelang.layout import make_cutlass_metadata_layout # Configure logger logger = logging.getLogger(__name__) @@ -70,7 +70,8 @@ def get_configs(M, N, K): thread_num, policy, enable_rasterization, - )) + ) + ) configs = [ { @@ -81,12 +82,13 @@ def get_configs(M, N, K): "thread_num": c[4], "policy": c[5], "enable_rasterization": c[6], # keep param name for backward-compat - } for c in _configs + } + for c in _configs ] return configs -def matmul_sp(M, N, K, accum_dtype): +def matmul_sp(M, N, K, in_dtype, accum_dtype): """ Create an autotuned matrix multiplication kernel for matrices of shape: - A: (M, K) @@ -126,7 +128,9 @@ def matmul_sp(M, N, K, accum_dtype): warmup=3, rep=20, ) - @jit(out_idx=[2],) + @jit( + out_idx=[2], + ) def kernel( block_M=None, block_N=None, @@ -161,15 +165,14 @@ def matmul_sp(M, N, K, accum_dtype): """ # Use half-precision for input data to reduce memory bandwidth, # accumulate in float for better numerical accuracy - dtype = "float16" e_factor, e_dtype = ARCH_INFO[arch] @T.prim_func def main( - A_sparse: T.Tensor((M, K // 2), dtype), - E: T.Tensor((M, K // e_factor), e_dtype), - B: T.Tensor((K, N), dtype), - C: 
T.Tensor((M, N), accum_dtype), + A_sparse: T.Tensor((M, K // 2), in_dtype), + E: T.Tensor((M, K // e_factor), e_dtype), + B: T.Tensor((K, N), in_dtype), + C: T.Tensor((M, N), accum_dtype), ): """ The compiled TVM function for block-level matrix multiplication. @@ -183,13 +186,11 @@ def matmul_sp(M, N, K, accum_dtype): """ # Bind x-dimension to block index in N, # y-dimension to block index in M. - with T.Kernel( - T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=thread_num) as (bx, by): - + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=thread_num) as (bx, by): # Allocate shared memory for A sub-block of shape (block_M, block_K) - A_shared = T.alloc_shared((block_M, block_K // 2), dtype) + A_shared = T.alloc_shared((block_M, block_K // 2), in_dtype) # Allocate shared memory for B sub-block of shape (block_N, block_K) - B_shared = T.alloc_shared((block_K, block_N), dtype) + B_shared = T.alloc_shared((block_K, block_N), in_dtype) # Allocate shared memory for E sub-block of shape (block_M, block_K // E_factor) E_shared = T.alloc_shared((block_M, block_K // e_factor), e_dtype) # Allocate a local fragment for intermediate accumulation @@ -202,14 +203,12 @@ def matmul_sp(M, N, K, accum_dtype): T.disable_warp_group_reg_alloc() T.use_swizzle(panel_size=10, enable=enable_rasterization) - T.annotate_layout({ - E: - make_metadata_layout( - E, mma_dtype="float16", backend="cutlass", block_k=block_K), - E_shared: - make_metadata_layout( - E_shared, mma_dtype="float16", backend="cutlass", block_k=block_K), - }) + T.annotate_layout( + { + E: make_cutlass_metadata_layout(E, mma_dtype=in_dtype, block_k=block_K), + E_shared: make_cutlass_metadata_layout(E_shared, mma_dtype=in_dtype, block_k=block_K), + } + ) # Loop over sub-blocks in K dimension, pipelined by num_stages for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages): # Load a sub-block of A from global memory into A_shared @@ -220,7 +219,7 @@ def matmul_sp(M, N, K, accum_dtype): T.copy(B[k * block_K, bx * block_N], B_shared) # Perform a partial matrix multiplication: # C_local += A_shared @ B_shared - T.gemm_sp( + T.gemm_sp_v2( A_shared, E_shared, B_shared, @@ -244,18 +243,13 @@ if __name__ == "__main__": parser.add_argument("--n", type=int, default=16384, help="Matrix dimension N") parser.add_argument("--k", type=int, default=16384, help="Matrix dimension K") parser.add_argument("--disable_cache", action="store_true") - parser.add_argument( - "--accum_dtype", - type=str, - default="float", - choices=["float", "float16"], - help="Accumulation datatype") + parser.add_argument("--accum_dtype", type=str, default="float", choices=["float", "float16"], help="Accumulation datatype") parser.add_argument( "--bench_torch_sparse", type=str, - choices=['cutlass', 'cusparselt'], + choices=["cutlass", "cusparselt"], default=None, - help="Whether to benchmark against torch sparse implementation, note that at current time only sm80 is supported" + help="Whether to benchmark against torch sparse implementation, note that at current time only sm80 is supported", ) args = parser.parse_args() @@ -268,7 +262,7 @@ if __name__ == "__main__": total_flops = 2 * M * N * K # matmul(...) 
returns (best_latency, best_config, ref_latency) - best_result = matmul_sp(M, N, K, args.accum_dtype) + best_result = matmul_sp(M, N, K, T.float16, args.accum_dtype) best_latency = best_result.latency best_config = best_result.config A = torch.randn(M, K, dtype=torch.float16, device="cuda") @@ -277,7 +271,8 @@ if __name__ == "__main__": if args.bench_torch_sparse is not None: from torch.sparse import to_sparse_semi_structured, SparseSemiStructuredTensor - if args.bench_torch_sparse == 'cutlass': + + if args.bench_torch_sparse == "cutlass": SparseSemiStructuredTensor._FORCE_CUTLASS = True A_sp = to_sparse_semi_structured(A, transposed=False) torch_sparse_latency = do_bench(lambda: A_sp @ B) @@ -288,8 +283,6 @@ if __name__ == "__main__": print(f"Best config: {best_config}") if args.bench_torch_sparse is not None: - print( - f"Torch sparse ({args.bench_torch_sparse}) TFlops: {total_flops / torch_sparse_latency * 1e-9:.3f}" - ) + print(f"Torch sparse ({args.bench_torch_sparse}) TFlops: {total_flops / torch_sparse_latency * 1e-9:.3f}") print(f"Reference Dense TFlops: {total_flops / ref_latency * 1e-9:.3f}") diff --git a/benchmark/matmul_fp8/benchmark_matmul.py b/benchmark/matmul_fp8/benchmark_matmul.py index 36b9103555d9280f1bb6623c269b66e8fd9aba33..e2e62812fc53cafb11e6b7d433a1cb61c37076f8 100644 --- a/benchmark/matmul_fp8/benchmark_matmul.py +++ b/benchmark/matmul_fp8/benchmark_matmul.py @@ -1,5 +1,6 @@ import argparse import itertools +import torch import logging import tilelang import tilelang.language as T @@ -62,9 +63,9 @@ def get_configs(args, kwargs): M=M, N=N, K=K, - in_dtype="float16", - out_dtype="float16", - accum_dtype="float", + in_dtype=T.float16, + out_dtype=T.float16, + accum_dtype=T.float32, ).with_arch(arch) func = carve_template.equivalent_function() @@ -99,12 +100,11 @@ def get_configs(args, kwargs): block_K=[64, 128], num_stages=[0, 1, 2, 3], thread_num=[128, 256], + k_pack=[1, 2], policy=[T.GemmWarpPolicy.Square], enable_rasteration=[True, False], ) - return [{ - k: v for k, v in zip(iter_params, values) - } for values in itertools.product(*iter_params.values())] + return [{k: v for k, v in zip(iter_params, values)} for values in itertools.product(*iter_params.values())] return configs @@ -114,7 +114,9 @@ def get_configs(args, kwargs): warmup=3, rep=20, ) -@jit(out_idx=[2],) +@jit( + out_idx=[2], +) def matmul( M, N, @@ -125,6 +127,7 @@ def matmul( block_K=None, num_stages=None, thread_num=None, + k_pack=None, policy=None, enable_rasteration=None, ): @@ -156,14 +159,14 @@ def matmul( # Use half-precision for input data to reduce memory bandwidth, # accumulate in float for better numerical accuracy - dtype = "float8_e4m3" - accum_dtype = "float" + dtype = T.float8_e4m3fnuz if torch.version.hip is not None else T.float8_e4m3fn + accum_dtype = T.float32 @T.prim_func def main( - A: T.Tensor((M, K), dtype), - B: T.Tensor((N, K), dtype), - C: T.Tensor((M, N), dtype), + A: T.Tensor((M, K), dtype), + B: T.Tensor((N, K), dtype), + C: T.Tensor((M, N), dtype), ): """ The compiled TVM function for block-level matrix multiplication. @@ -178,7 +181,6 @@ def matmul( # Bind x-dimension to block index in N, # y-dimension to block index in M. 
with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=thread_num) as (bx, by): - # Allocate shared memory for A sub-block of shape (block_M, block_K) A_shared = T.alloc_shared((block_M, block_K), dtype) # Allocate shared memory for B sub-block of shape (block_N, block_K) @@ -210,6 +212,7 @@ def matmul( C_local, transpose_B=True, policy=policy, + k_pack=k_pack, ) # Write back the results from C_local to the global memory C T.copy(C_local, C_shared) diff --git a/cmake/load_tvm.cmake b/cmake/load_tvm.cmake index f013c3ba60ea77e6a1177b1ac88b5f4d12d4cbe2..cb21be95f649174d829778855a4849ae1d87a006 100644 --- a/cmake/load_tvm.cmake +++ b/cmake/load_tvm.cmake @@ -3,12 +3,15 @@ set(TVM_BUILD_FROM_SOURCE TRUE) set(TVM_SOURCE ${CMAKE_SOURCE_DIR}/3rdparty/tvm) -if(DEFINED $ENV{TVM_ROOT}) +if(DEFINED ENV{TVM_ROOT}) if(EXISTS $ENV{TVM_ROOT}/cmake/config.cmake) set(TVM_SOURCE $ENV{TVM_ROOT}) + message(STATUS "Using TVM_ROOT from environment variable: ${TVM_SOURCE}") endif() endif() +message(STATUS "Using TVM source: ${TVM_SOURCE}") + set(TVM_INCLUDES ${TVM_SOURCE}/include ${TVM_SOURCE}/src diff --git a/cmake/pypi-z3/FindZ3.cmake b/cmake/pypi-z3/FindZ3.cmake new file mode 100644 index 0000000000000000000000000000000000000000..d7920f8f9ccc66f300ac2b1f92b5d8044e457839 --- /dev/null +++ b/cmake/pypi-z3/FindZ3.cmake @@ -0,0 +1,30 @@ +if(Z3_FOUND) + return() +endif() +find_package(Python3 COMPONENTS Interpreter REQUIRED) +execute_process( + COMMAND "${Python3_EXECUTABLE}" -c "import z3; print(z3.__path__[0])" + OUTPUT_VARIABLE Z3_PATH + OUTPUT_STRIP_TRAILING_WHITESPACE + RESULT_VARIABLE Z3_PYTHON_RESULT +) +if(NOT Z3_PYTHON_RESULT EQUAL 0 OR Z3_PATH STREQUAL "") + message(FATAL_ERROR "Failed to locate z3 Python package. Ensure z3-solver>=4.13.0 is installed.") +endif() +message("-- Find Z3 in path: ${Z3_PATH}") +find_path(Z3_INCLUDE_DIR NO_DEFAULT_PATH NAMES z3++.h PATHS ${Z3_PATH}/include) +find_library(Z3_LIBRARY NO_DEFAULT_PATH NAMES z3 libz3 PATHS ${Z3_PATH}/bin ${Z3_PATH}/lib ${Z3_PATH}/lib64) +message("-- Found Z3 include dir: ${Z3_INCLUDE_DIR}") +message("-- Found Z3 library: ${Z3_LIBRARY}") +add_library(z3::libz3 SHARED IMPORTED GLOBAL) +set_target_properties(z3::libz3 + PROPERTIES + IMPORTED_LOCATION ${Z3_LIBRARY} + INTERFACE_INCLUDE_DIRECTORIES ${Z3_INCLUDE_DIR} +) +if(NOT Z3_INCLUDE_DIR OR NOT Z3_LIBRARY) + message(FATAL_ERROR "Could not find Z3 library or include directory") +endif() +set(Z3_CXX_INCLUDE_DIRS ${Z3_INCLUDE_DIR}) +set(Z3_C_INCLUDE_DIRS ${Z3_INCLUDE_DIR}) +set(Z3_FOUND TRUE) diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm index f519bb0aaac6aa38f5a77795e3c57c60b3282a63..5f61f0e2e8c41f40ea2d213e07ae1e282a9f0102 100644 --- a/docker/Dockerfile.rocm +++ b/docker/Dockerfile.rocm @@ -9,23 +9,43 @@ ENV DEBIAN_FRONTEND=noninteractive RUN apt-get update && apt-get install -y --no-install-recommends \ build-essential git wget \ libgtest-dev libprotobuf-dev protobuf-compiler libgflags-dev libsqlite3-dev llvm-dev \ + rocm-dev rocm-libs hip-dev hipblas-dev rocblas-dev \ && apt-get clean autoclean && rm -rf /var/lib/apt/lists/{cache,log} /tmp/* /var/tmp/* ENV PATH="/opt/conda/bin:${PATH}" ENV LIBGL_ALWAYS_INDIRECT=1 +ENV USE_ROCM=1 +ENV USE_CUDA=0 +ENV ROCM_HOME=/opt/rocm +ENV HIP_PLATFORM=amd +ENV PYTORCH_ROCM_ARCH="gfx90a;gfx942" RUN conda run -n py_3.10 conda install pip cmake -y && \ conda run -n py_3.10 conda install -c conda-forge libstdcxx-ng=12 -y && \ conda clean --all -RUN apt-get install -y python3 python3-dev python3-setuptools gcc libtinfo-dev zlib1g-dev 
build-essential cmake libedit-dev libxml2-dev +RUN apt-get update && apt-get install -y python3 python3-dev python3-setuptools gcc libtinfo-dev zlib1g-dev build-essential cmake libedit-dev libxml2-dev && \ + apt-get clean autoclean && rm -rf /var/lib/apt/lists/{cache,log} /tmp/* /var/tmp/* -RUN git clone https://github.com/tile-ai/tilelang.git --recursive -b main tilelang && \ - conda run -n py_3.10 bash -c "cd tilelang && USE_ROCM=1 pip install -e . -v" +# Copy local tilelang directory instead of cloning from git +# Build from tilelang root: docker build -f docker/Dockerfile.rocm -t mi300:latest . +COPY . /root/tilelang -RUN conda init bash +RUN mv /opt/conda/envs/py_3.10/compiler_compat /opt/conda/envs/py_3.10/compiler_compat.bak || true && \ + conda run -n py_3.10 bash -c "export USE_ROCM=1 USE_CUDA=0 && pip install 'numpy<2.0' --force-reinstall" && \ + conda run -n py_3.10 bash -c "cd /root/tilelang && \ + # Backup and modify pyproject.toml to remove torch from dependencies \ + cp pyproject.toml pyproject.toml.bak && \ + sed -i '/^[[:space:]]*\"torch/d' pyproject.toml && \ + # Install tilelang with all dependencies except torch \ + USE_ROCM=1 USE_CUDA=0 pip install -e . -v && \ + # Restore original pyproject.toml \ + mv pyproject.toml.bak pyproject.toml" + +RUN conda init bash && \ + echo "conda activate py_3.10" >> /root/.bashrc SHELL ["/bin/bash", "-l", "-c"] -CMD ["bash", "-c", "source ~/.bashrc && conda activate py_3.10 && exec bash"] \ No newline at end of file +ENTRYPOINT ["/bin/bash", "--login", "-i"] diff --git a/docs/_static/custom.css b/docs/_static/custom.css new file mode 100644 index 0000000000000000000000000000000000000000..0ef6b48cb8b08d17ac582728af5a6f040de06ee1 --- /dev/null +++ b/docs/_static/custom.css @@ -0,0 +1,11 @@ +/* Reduce the displayed size of the sidebar logo in Furo */ +.sidebar-logo { + max-height: 125px; + width: auto; +} + +/* Optional: keep container from growing too tall due to spacing */ +.sidebar-logo-container { + line-height: 0; +} + diff --git a/docs/_static/img/logo-v2.png b/docs/_static/img/logo-v2.png new file mode 100644 index 0000000000000000000000000000000000000000..410773f60a0d6ddf9bb86186ecb70529ff1d4667 Binary files /dev/null and b/docs/_static/img/logo-v2.png differ diff --git a/docs/_static/img/logo.png b/docs/_static/img/logo.png new file mode 100644 index 0000000000000000000000000000000000000000..5d04697ce4cd98d1aa6d9edc0f492601ef92c575 Binary files /dev/null and b/docs/_static/img/logo.png differ diff --git a/docs/_static/img/sparse_mma_storage_example.png b/docs/_static/img/sparse_mma_storage_example.png new file mode 100644 index 0000000000000000000000000000000000000000..0b16398197b28f10b5681cfd27a9f2fe061be5cd Binary files /dev/null and b/docs/_static/img/sparse_mma_storage_example.png differ diff --git a/docs/compiler_internals/tensor_checks.md b/docs/compiler_internals/tensor_checks.md new file mode 100644 index 0000000000000000000000000000000000000000..b4d2a0b3c03048455b2e2a77e3536292d5f5a202 --- /dev/null +++ b/docs/compiler_internals/tensor_checks.md @@ -0,0 +1,387 @@ +# Tensor Checks (Host-Side Auto-Validation) + +This page explains the host-side checks that TileLang automatically inserts into the generated host stub for kernels. When you pass `torch.Tensor` or any DLPack-compatible object to a TileLang kernel, the host stub validates argument count, pointer kinds, dtype, shape, strides, device, and more — so you don’t need to handwrite Python checks. 
This keeps the ABI stable and significantly reduces Python overhead compared to doing equivalent checks in Python or via pybind. + +## Why Host-Side Checks +- ABI stability: the entry is based on TVM FFI + DLPack, consistently accepting tensors and scalars. +- Lower overhead: shifting checks from Python into C reduces interpreter/property-access costs; the call overhead is lower than pybind-based approaches. +- Focused error reporting: assertions are raised close to the call site with precise “which field failed” messages. + +## How To Inspect Host Source +You can inspect the auto-generated host source (with all checks and the final device-kernel call) for debugging: + +```python +print(matmul_relu_kernel.get_host_source()) +``` + +--- + +## What The Host Checks + +### 1) Argument count and pointer kind +- `num_args` must match the number of formal parameters; otherwise the kernel returns `-1` with an error message. +- Each argument’s FFI type must be a pointer kind (for DLTensor/handle) or a valid scalar type; otherwise you’ll see errors like `Expect arg[i] to be pointer` or a scalar type error. + +### 2) Tensor checks (per tensor, after nullability decision) +- Nullability + - If the tensor is “statically reachable/used” by the function body, the handle must be non-NULL; otherwise: `xxx is expected to have non-NULL pointer`. + - If an input tensor is not used by the function (statically unreachable), NULL is allowed; other field checks are executed only when `handle != NULL`. +- Rank (`ndim`) + - Runtime `ndim` must equal the compile-time rank. +- Data type (`dtype`) + - Match the triple `(code, bits, lanes)` with tolerance: + - `float8_e4m3`: accept `e4m3`, `e4m3fn`, `e4m3fnuz`. + - `float8_e5m2`: accept `e5m2`, `e5m2fnuz`. + - `bool`: accept `int8/uint8` with `bits=8` (same lanes), `kDLBool(code=6, bits=1 or 8)`, and any `bitwidth=1` (lanes must match). + - For packed-bit dtypes (e.g., `Int(1)`, `Int(4)`, `UInt(4)`), strict dtype checking is skipped. +- Shape + - Each runtime dimension is bound to the compile-time shape (constants or symbols) and checked for consistency. + - Linear equations among symbolic dims can be solved on the fly (when there’s only one unknown at a given check point), enabling cross-tensor constraints. +- Strides + - If `buffer_type = AutoBroadcast`: allow `strides == NULL` and derive strides from `shape`. If explicit `strides` is present, bind to compile-time constraints and check for equality. + - Otherwise: check per-dimension; if `strides == NULL`, derive from `shape` and compare (e.g., contiguous: `strides[-1] == 1`, `strides[-2] == shape[-1]`). +- `byte_offset` + - Must be 0 (non-zero raises an error) to keep addressing simple and aligned. +- Device info + - Assert `device_type == target backend` (CUDA/ROCM/Metal/OneAPI/WebGPU/CPU, etc.). Error messages include a DLPack code legend. + - When multiple tensors participate, assert that `device_id` matches across them. +- Data pointer + - Must be non-NULL when the tensor is required to be non-null by the nullability rule. + +### 3) Scalar checks +- `T.int*` family: require integer; error: `Expect arg[i] to be int`. +- `T.bool`: require boolean; error: `Expect arg[i] to be boolean`. + +--- + +## Shapes and Symbolic Equations: Linear Solving +When shapes are symbolic, the host binds and (when possible) solves linear relations at runtime (only one unknown per check point). 
Example: + +```python +@T.prim_func +def main( + A: T.Tensor((m,), dtype), + B: T.Tensor((m + n,), dtype), + C: T.Tensor((n * k,), dtype), +): + ... +``` + +This enables enforcing cross-tensor relationships like `len(B) == m + n` and `len(C) == n * k` at runtime. + +--- + +## Nullability Rules and Examples +Which tensors may be NULL? + +- Rule: If an input tensor is not used by the function under static analysis (i.e., the access is statically unreachable), it is considered Nullable; otherwise it must be non-NULL. +- Examples: + +1) Must be non-NULL (used) +```python +@T.prim_func +def main(A: T.Tensor((M, K), dtype)): + A[0] = 1 +``` +Passing `None` raises: `main.A_handle is expected to have non-NULL pointer`. + +2) Still must be non-NULL (constant-true branch) +```python +some_cond: bool = True +@T.prim_func +def main(A: T.Tensor((M, K), dtype)): + if some_cond: + A[0] = 1 +``` + +3) Nullable (constant-false branch, statically unreachable) +```python +some_cond: bool = False +@T.prim_func +def main(A: T.Tensor((M, K), dtype)): + if some_cond: + A[0] = 1 +``` + +4) Must be non-NULL (runtime condition) +```python +@T.prim_func +def main(A: T.Tensor((M, K), dtype), some_cond: T.bool): + if some_cond: + A[0] = 1 +``` +Since `some_cond` is only known at runtime, static analysis cannot prove `A` is unused; `A` is thus non-nullable. + +--- + +## Device Type Codes (DLPack) +Supported and referenced device codes in error messages: `1=CPU, 2=CUDA, 7=Vulkan, 8=Metal, 10=ROCM, 14=OneAPI, 15=WebGPU`. +Kernels assert that `device_type` matches the target backend, and require `device_id` consistency across tensors. + +--- + +## Common Error Examples (What you’ll see) +- Argument count mismatch (num_args) + - Trigger: missing/extra argument + - Error: `: num_args should be N; expected: , got: N` + +- Pointer-typed argument expected + - Trigger: scalar passed where a tensor is expected + - Error: `: Expect arg[i] to be pointer` + +- Rank (ndim) mismatch + - Trigger: runtime rank differs from compile-time rank + - Error: `..ndim is expected to equal R, but got mismatched ndim` + +- Dtype mismatch + - Trigger: dtype not equal to the compiled dtype and not within the tolerance set + - Error: `..dtype is expected to be , but got incompatible dtype` + +- Shape constraint violation + - Trigger: a dimension doesn’t match a constant/symbol binding + - Error: `Argument ..shape[i] has an unsatisfied constraint: ... == ` + +- Strides check failed (e.g., non-contiguous layout) + - Trigger: transposed/sliced tensors that violate expected strides + - Error: `Argument ..strides[j] has an unsatisfied constraint: ... == ` + +- Device type mismatch + - Trigger: calling a CUDA kernel with CPU tensors, etc. + - Error: `..device_type mismatch [expected: ()] ...` + +- Device id mismatch + - Trigger: mixing tensors from different GPUs + - Error: `Argument ..device_id has an unsatisfied constraint: ... == ...` + +- NULL data pointer + - Trigger: tensor required to be non-null has a NULL data pointer + - Error: `. is expected to have non-NULL data pointer, but got NULL` + +- Scalar type mismatch + - Trigger: passing float to `T.int32`, or non-boolean to `T.bool` + - Error: `: Expect arg[i] to be int/boolean` + +--- + +## Troubleshooting Tips +- Print the host source: `print(fn.get_host_source())` to see the exact assertion and expected vs. actual fields. +- Fix strides: call `.contiguous()` for non-contiguous tensors, or avoid generating transposed/sliced layouts that break assumptions. 
+- Align devices: ensure all participating tensors share the same `device_type` and `device_id`. +- Align dtype: use `.to()` or construct tensors with the correct dtype; pay attention to `float8` and `bool` tolerance. +- Dynamic shapes: ensure cross-tensor linear relations can be uniquely determined at the check point (only one unknown at a time). + +--- + +## FAQ +- Can I disable the checks? + - Not recommended and usually not supported. Checks are done on the host to preserve ABI stability and fail early close to the device call. +- Is the overhead noticeable? + - The checks are lightweight (branches and field reads). Compared to Python-side checks, it’s faster; the dominating cost remains the Python→C boundary. Overall it’s cheaper than equivalent checks in Python. + +--- + +## Reference Example (Matmul + ReLU) + +```python +@T.prim_func +def matmul_relu_kernel( + A: T.Tensor((M, K), dtype), + B: T.Tensor((K, N), dtype), + C: T.Tensor((M, N), dtype), +): + # Initialize Kernel Context + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=128) as (bx, by): + A_shared = T.alloc_shared((block_M, block_K), dtype) + B_shared = T.alloc_shared((block_K, block_N), dtype) + C_local = T.alloc_fragment((block_M, block_N), accum_dtype) + T.clear(C_local) + for ko in T.Pipelined(T.ceildiv(K, block_K), num_stages=0): + T.copy(A[by * block_M, ko * block_K], A_shared) + T.copy(B[ko * block_K, bx * block_N], B_shared) + T.gemm(A_shared, B_shared, C_local) + T.copy(C_local, C[by * block_M, bx * block_N]) + +# For debugging, print the host source +print(matmul_relu_kernel.get_host_source()) +``` + +The host will insert all checks described above for this example. + +--- + +## Quick Error Reference (Short List) +- Argument count + - Trigger: missing/extra args; Error: `num_args should be N; expected: , got: N`. +- Pointer kind + - Trigger: scalar passed to tensor arg; Error: `Expect arg[i] to be pointer`. +- Rank (ndim) + - Trigger: runtime rank != compile-time; Error: `ndim ... expected to equal R`. +- Dtype + - Trigger: mismatch and not tolerated; Error: `dtype ... expected to be `. +- Shape + - Trigger: constant/symbol binding violated; Error: `shape[i] ... == `. +- Strides + - Trigger: layout mismatch; Error: `strides[j] ... == `. +- Device type + - Trigger: wrong backend device; Error: `device_type mismatch [expected: ...]`. +- Device id + - Trigger: tensors on different GPUs; Error: `device_id ... == ...`. +- Data pointer + - Trigger: required non-NULL but NULL; Error: `non-NULL data pointer`. +- Scalar types + - Trigger: wrong scalar type; Error: `Expect arg[i] to be int/boolean`. + +--- + +## Host Error Troubleshooting (Minimal Repros) + +Below are minimal repro snippets for common host-side errors, assuming a CUDA-targeted kernel like `matmul_relu_kernel` with: + +```python +# Convention: +# A: float16 [M, K] +# B: float16 [K, N] +# C: float16 [M, N] +# Target: CUDA (device_type=2) +fn = matmul_relu_kernel # your compiled function +M = N = K = 1024 +``` + +Adjust dtype/device if your kernel differs. + +### 0. Tip: print the host source +```python +print(fn.get_host_source()) +``` + +### 1. num_args mismatch +```python +import torch + +A = torch.empty((M, K), device='cuda', dtype=torch.float16) +B = torch.empty((K, N), device='cuda', dtype=torch.float16) +# Missing C +fn(A, B) +``` +Expected: `: num_args should be 3; expected: , got: 3`. + +Fix: pass all arguments per the signature. + +### 2. 
Expect pointer (tensor) but got scalar +```python +import torch + +B = torch.empty((K, N), device='cuda', dtype=torch.float16) +C = torch.empty((M, N), device='cuda', dtype=torch.float16) +fn(1, B, C) +``` +Expected: `: Expect arg[0] to be pointer`. + +Fix: pass a DLPack-compatible tensor (e.g., torch.Tensor). + +### 3. ndim mismatch +```python +import torch + +A = torch.empty((M, K, 1), device='cuda', dtype=torch.float16) # rank=3 +B = torch.empty((K, N), device='cuda', dtype=torch.float16) +C = torch.empty((M, N), device='cuda', dtype=torch.float16) +fn(A, B, C) +``` +Expected: `.A_handle.ndim is expected to equal 2, but got mismatched ndim`. + +Fix: ensure runtime rank equals compiled rank. + +### 4. dtype mismatch +```python +import torch + +A = torch.empty((M, K), device='cuda', dtype=torch.float32) # should be float16 +B = torch.empty((K, N), device='cuda', dtype=torch.float16) +C = torch.empty((M, N), device='cuda', dtype=torch.float16) +fn(A, B, C) +``` +Expected: `.A_handle.dtype is expected to be float16, but got incompatible dtype`. + +Fix: `A = A.to(torch.float16)` or create with the correct dtype. + +### 5. Shape constant/symbol mismatch +```python +import torch + +A = torch.empty((M, K + 1), device='cuda', dtype=torch.float16) # K mismatched +B = torch.empty((K, N), device='cuda', dtype=torch.float16) +C = torch.empty((M, N), device='cuda', dtype=torch.float16) +fn(A, B, C) +``` +Expected: `Argument .A_handle.shape[i] has an unsatisfied constraint: ... == `. + +Fix: satisfy linear constraints and constants across tensors. + +### 6. Strides check failure (non-contiguous) +```python +import torch + +A = torch.empty((M, K), device='cuda', dtype=torch.float16) +A_nc = A.t() # transpose -> non-contiguous +B = torch.empty((K, N), device='cuda', dtype=torch.float16) +C = torch.empty((M, N), device='cuda', dtype=torch.float16) +fn(A_nc, B, C) +``` +Expected: `Argument .A_handle.strides[1] has an unsatisfied constraint: ... == 1`. + +Fix: pass `A_nc.contiguous()` or align the layout expectation in the kernel. + +### 7. device_type mismatch +```python +import torch + +A = torch.empty((M, K), device='cpu', dtype=torch.float16) +B = torch.empty((K, N), device='cpu', dtype=torch.float16) +C = torch.empty((M, N), device='cpu', dtype=torch.float16) +fn(A, B, C) # CUDA-targeted kernel +``` +Expected: `.A_handle.device_type mismatch [expected: 2 (cuda)] ...`. + +Fix: move tensors to the CUDA device. + +### 8. device_id mismatch (multi-GPU) +```python +import torch + +A = torch.empty((M, K), device='cuda:0', dtype=torch.float16) +B = torch.empty((K, N), device='cuda:1', dtype=torch.float16) +C = torch.empty((M, N), device='cuda:0', dtype=torch.float16) +fn(A, B, C) +``` +Expected: `Argument .B_handle.device_id has an unsatisfied constraint: ... == ...`. + +Fix: place all tensors on the same GPU (e.g., `cuda:0`). + +### 9. NULL data pointer (advanced) +This usually comes from hand-constructed DLTensor/NDArray, or external frameworks passing unallocated/freed storage. Regular `torch.Tensor` allocations rarely hit this. + +Expected: `. is expected to have non-NULL data pointer, but got NULL`. + +Fix: ensure valid underlying storage; in PyTorch scenarios, avoid constructing tensors from invalid external handles. + +### 10. 
Scalar type mismatch (int / bool) +```python +import tilelang.language as T + +@T.prim_func +def scalar_check(x: T.int32, flag: T.bool()): + T.evaluate(0) + +scalar_check(1.0, True) # x is float -> Expect arg[0] to be int +scalar_check(1, 2.5) # flag is float -> Expect arg[1] to be boolean +``` + +Fix: pass correct scalar types, e.g., `scalar_check(1, True)`. + +--- + +## Closing Notes +- Cross-check “shape / strides / device / dtype” against the kernel signature to localize issues efficiently. +- For complex symbolic relations, print the host source to confirm binding/solving order, then adjust runtime shapes/layouts accordingly. + diff --git a/docs/conf.py b/docs/conf.py index 1b128903852215b98dbf68f146b4f8ab89085c32..877b5582e1e28ee75704d5c75a8ff900a61c4cd3 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -1,5 +1,5 @@ # General information about the project. -project = "Tile Language
" +project = "TileLang
" author = "Tile Lang Contributors" copyright = f"2025-2025, {author}" @@ -20,33 +20,27 @@ extensions = [ "autoapi.extension", ] -autoapi_type = 'python' -autoapi_dirs = ['../tilelang'] +autoapi_type = "python" +autoapi_dirs = ["../tilelang"] autoapi_options = [ - 'members', - 'undoc-members', - 'show-inheritance', - 'show-module-summary', - 'special-members', + "members", + "undoc-members", + "show-inheritance", + "show-module-summary", + "special-members", ] autoapi_keep_files = False # Useful for debugging the generated rst files autoapi_generate_api_docs = True -autodoc_typehints = 'description' +autodoc_typehints = "description" autoapi_ignore = ["*language/ast*", "*version*", "*libinfo*", "*parser*"] -source_suffix = { - '.rst': 'restructuredtext', - '.md': 'markdown', -} +source_suffix = {".rst": "restructuredtext", ".md": "markdown"} -myst_enable_extensions = [ - "colon_fence", - "deflist", -] +myst_enable_extensions = ["colon_fence", "deflist"] redirects = {"get_started/try_out": "../index.html#getting-started"} @@ -62,13 +56,11 @@ todo_include_todos = False html_theme = "furo" templates_path = [] html_static_path = ["_static"] -footer_copyright = "© 2025-2025 Tile Language" +html_css_files = ["custom.css"] +footer_copyright = "© 2025-2026 TileLang" footer_note = " " -html_theme_options = { - "light_logo": "img/logo-row.svg", - "dark_logo": "img/logo-row.svg", -} +html_theme_options = {"light_logo": "img/logo-v2.png", "dark_logo": "img/logo-v2.png"} header_links = [ ("Home", "https://github.com/tile-ai/tilelang"), diff --git a/docs/deeplearning_operators/elementwise.md b/docs/deeplearning_operators/elementwise.md index 5e1243c26827e31ab89cfb122af664899b4ddf7a..f3543c02f5ed4a7d95708de488dba0309ca7bf93 100644 --- a/docs/deeplearning_operators/elementwise.md +++ b/docs/deeplearning_operators/elementwise.md @@ -24,7 +24,7 @@ Please note that this tutorial does not delve deeply into the design principles ## Elementwise add in TileLang ```python -def elementwise_add(N, threads=256, dtype="bfloat16"): +def elementwise_add(N, threads=256, dtype=T.bfloat16): @T.prim_func def main(A: T.Tensor((N), dtype), B: T.Tensor((N), dtype), C: T.Tensor((N), dtype)): @@ -43,7 +43,7 @@ Those familiar with CUDA programming might wonder where `threadIdx` fits into th The program can be compiled using the following code: ```python -program = elementwise_add(1024, threads=256, dtype="bfloat16") +program = elementwise_add(1024, threads=256, dtype=T.bfloat16) kernel = tilelang.compile(program, out_idx=-1, target="cuda", execution_backend="cython") ``` Launching the kernel is straightforward, just call it directly like a function: @@ -89,7 +89,7 @@ def elementwise_add( In the compilation process above, a fixed shape was used. However, in practical usage, we often want the kernel to support dynamic shapes. So, how can we compile a kernel in TileLang to handle dynamic shapes? In TileLang, we can replace the target size with a dynamic symbolic value, making the dimension dynamic. 
The following example illustrates this: ```python -program = elementwise_add(T.dynamic("N"), threads=256, dtype="bfloat16") +program = elementwise_add(T.dynamic("N"), threads=256, dtype=T.bfloat16) kernel = tilelang.compile(program, out_idx=-1, target="cuda", execution_backend="cython") ``` @@ -102,7 +102,7 @@ TileLang automatically incorporates boundary-checking conditions; however, this When compiling the example below, let's set `N` to 2047: ```python -def elementwise_add(N, num_per_thread=8, threads=256, dtype="bfloat16"): +def elementwise_add(N, num_per_thread=8, threads=256, dtype=T.bfloat16): @T.prim_func def main(A: T.Tensor((N), dtype), B: T.Tensor((N), dtype), C: T.Tensor((N), dtype)): @@ -176,7 +176,7 @@ While TileLang incorporates various optimizations for the aforementioned case, i In such scenarios, explicitly specifying the number of elements computed per thread can help "guide" TileLang's code generation process, leading to implementations that are more closely aligned with the intended design. ```python -def elementwise_add(N, num_per_thread=8, threads=256, dtype="bfloat16"): +def elementwise_add(N, num_per_thread=8, threads=256, dtype=T.bfloat16): @T.prim_func def main(A: T.Tensor((N), dtype), B: T.Tensor((N), dtype), C: T.Tensor((N), dtype)): @@ -212,7 +212,7 @@ Aha, this CUDA code aligns closely with conventional programming practices, maki But what happens if we provide additional hints to TileLang? For instance, by explicitly specifying register copies using the `T.copy(...)` operation. The example below demonstrates a vector addition implementation. Unlike the previous examples, this code explicitly loads data into registers before performing computations. ```python -def elementwise_add(N, NUM_ELE_PER_THREAD=8, threads=256, dtype="bfloat16"): +def elementwise_add(N, NUM_ELE_PER_THREAD=8, threads=256, dtype=T.bfloat16): @T.prim_func def main(A: T.Tensor((N), dtype), B: T.Tensor((N), dtype), C: T.Tensor((N), dtype)): diff --git a/docs/deeplearning_operators/matmul_sparse.md b/docs/deeplearning_operators/matmul_sparse.md new file mode 100644 index 0000000000000000000000000000000000000000..5910bd6f8c25943ee18bbd65b7ed7fa0b060de5a --- /dev/null +++ b/docs/deeplearning_operators/matmul_sparse.md @@ -0,0 +1,262 @@ +# Sparse Matrix-Matrix Multiplication with Tile Library + +
+ Author: botbw +
+ +:::{warning} + This document is still **experimental** and may be incomplete. + + This feature is still **experimental** and need further optimization. + + Suggestions and improvements are highly encouraged—please submit a PR! +::: + +:::{tip} +It's suggested to go through `docs/deeplearning_operators/matmul.md` first. + +Example code can be found at `examples/gemm_sp`. +::: + +## Structured sparsity in the NVIDIA Ampere architecture + +Since the Ampere architecture (sm80 and above), sparsity support has been integrated into Tensor Cores. This allows a 2:4 (or 1:2 for 32-bit data types) semi-structured matrix to be compressed into its non-zero values along with associated metadata, which can then be fed into the Tensor Core. This enables up to **2x throughput** compared to the equivalent dense computation. + +:::{warning} + This tutorial primarily focuses on CUDA, as this feature is not yet supported on ROCm. However, AMD provides a similar capability in the matrix cores of GPUs such as the MI300X. +::: + +```{figure} ../_static/img/sparse_mma_storage_example.png +:align: center + +Figure: Sparse MMA storage example (from PTX doc) +``` + +## Compress a dense tensor + +To utilize sparse Tensor Cores, a dense tensor must first be **compressed** into its non-zero values along with the corresponding metadata. + +Both `PyTorch` and `vLLM` use `CUTLASS` as their computation backend (see references [here](https://github.com/pytorch/pytorch/blob/a8d6afb511a69687bbb2b7e88a3cf67917e1697e/aten/src/ATen/native/sparse/cuda/SparseSemiStructuredOps.cu#L47) and [here](https://github.com/vllm-project/vllm/blob/a5dd03c1ebc5e4f56f3c9d3dc0436e9c582c978f/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cuh#L116)), leveraging `CUTLASS`’s built-in compressor (or reimplementing it in `PyTorch`). + +A set of **CUTLASS-compatible** compressors is provided in `tilelang.utils.sparse`, where a dense tensor—along with other required arguments (e.g., block_K for sm90, transpose options)—can be passed in to perform the compression. + +```python +from tilelang.utils.sparse import compress +A_sparse, E = compress(A, transposed=trans_A, block_k=block_K) +``` + +Here, `A_sparse` contains all the non-zero elements of `A`, while `E` stores the corresponding metadata (indexing information) required to reconstruct the original sparse pattern. + +> NOTE: When using CUTLASS compressor, there is no naive position correspondence between the positions in `A_sparse`/`A` and `E`. (i.e. the 4-element group at [n, k] doesn't match the 4-bit metadata at [n, k] if you consider metadata as int4 tensor) +The metadata is reordered internally to optimize memory access patterns (e.g., for ldsm instructions and vectorized loads). +For more information, see **A note on `gemm_sp` and `gemm_sp_v2`**. + + +## `T.gemm_sp` with CUTLASS's compressor + +:::{warning} + +It is strongly recommended to use T.gemm_sp_v2 due to its greater flexibility and faster compilation time. + +::: + +A 2:4 sparse GEMM kernel is similar to its dense counterpart, except that it also requires handling the associated metadata. + +Check comments in below kernel code for required modification. 
+ +```python +def matmul_sp_sm80( + M, + N, + K, + block_M, + block_N, + block_K, + in_dtype, + out_dtype, + accum_dtype, + num_stages, + threads, + trans_A, + trans_B, +): + is_8_bit = "8" in in_dtype + metadata_dtype = 'int32' if is_8_bit else 'int16' + E_factor = SparseTensorCoreIntrinEmitter.E_FACTOR_MAP[in_dtype][metadata_dtype] # Calculate shape for given datatypes + A_sparse_shape = (M, K // 2) if not trans_A else (K // 2, M) + B_shape = (K, N) if not trans_B else (N, K) + A_shared_shape = (block_M, block_K // 2) if not trans_A else (block_K // 2, block_M) + B_shared_shape = (block_K, block_N) if not trans_B else (block_N, block_K) + + import tilelang.language as T + + @T.prim_func + def main( + A_sparse: T.Tensor(A_sparse_shape, in_dtype), + E: T.Tensor((M, K // E_factor), metadata_dtype), + B: T.Tensor(B_shape, in_dtype), + C: T.Tensor((M, N), out_dtype), + ): + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): + A_shared = T.alloc_shared(A_shared_shape, in_dtype) + B_shared = T.alloc_shared(B_shared_shape, in_dtype) + E_shared = T.alloc_shared((block_M, block_K // E_factor), metadata_dtype) # Allocate smem for metadata + C_frag = T.alloc_fragment((block_M, block_N), accum_dtype) + T.annotate_layout({ # Annotate reordered cutlass metadata layout + E: + make_cutlass_metadata_layout(E, mma_dtype=in_dtype, arch="8.0"), + E_shared: + make_cutlass_metadata_layout( + E_shared, mma_dtype=in_dtype, arch="8.0"), + }) + T.clear(C_frag) + for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages): + T.copy(E[by * block_M, k * block_K // E_factor], E_shared) + if trans_A: + T.copy(A_sparse[k * block_K // 2, by * block_M], A_shared) + else: + T.copy(A_sparse[by * block_M, k * block_K // 2], A_shared) + if trans_B: + T.copy(B[bx * block_N, k * block_K], B_shared) + else: + T.copy(B[k * block_K, bx * block_N], B_shared) + T.gemm_sp(A_shared, E_shared, B_shared, C_frag, trans_A, trans_B) # Call gemm_sp with non-zero values and metadata + T.copy(C_frag, C[by * block_M, bx * block_N]) + + return main +``` + +Under the hood, `gemm_sp` invokes templates adapted from `CUTLASS`, and a compatible metadata layout must be specified using `T.annotate_layout`. + +## `T.gemm_sp_v2` with a custom compressor + +To migrate to `gemm_sp_v2`, simply replace occurrences of `gemm_sp`. + +Unlike `gemm_sp`, `gemm_sp_v2` can operate without `T.annotate_layout`, and it also supports user-defined layouts and compressors. + +The metadata is stored in a `(u)int8`/`(u)int16`/`(u)int32` tensor, where **each 4-bit chunk represents two 2-bit indices** of non-zero elements within four consecutive elements. Here, we start with an `int16` example, which is the **default dtype** for `bf16` and `fp16` on Ampere GPUs. 
+ +Suppose we have the following row vector: +```python +t = tensor([[0, 7, 0, 3], [1, 5, 0, 0], [0, 0, 2, 4], [9, 0, 9, 0]], dtype=torch.float16).flatten() +``` + +The non-zero elements and their corresponding indices are: + +```python +t_sp = tensor([[7, 3], [1, 5], [2, 4], [9, 9]], dtype=torch.float16).flatten() +indices = tensor([[1, 3], [0, 1], [2, 3], [0, 2]], dtype=torch.float16).flatten() +``` + +The corresponding uint16 metadata is: +```python +# metadata_bits = tensor([0b1101, 0b0100, 0b1110, 0b1000]) +# Note: storage uses little-endian order: tensor(0b1000111001001101, dtype=torch.int16) +# Note: the above code is not runnable in python as the interpreter won't take the binary +# as 2's complement +metadata_int16 = tensor(-29107) +``` + +You can decode an int16 metadata tensor using the following utility: +```python +def decode_metadata(meta: torch.Tensor) -> torch.Tensor: + assert meta.dtype is torch.int16 + groups_per_meta = 16 // 4 + out = [] + for g in range(groups_per_meta): + group_bits = (meta >> (g * 4)) & 0xF + idx0 = group_bits & 0x3 + idx1 = (group_bits >> 2) & 0x3 + out.append(torch.stack([idx0, idx1], dim=-1)) + return torch.concat(out, dim=-1).view(meta.shape[0], -1) +``` + +The compressor can be implement at either `PyTorch`/`NumPy` level or kernel level. + +For example, `PyTorch` provides an Ampere compressor [here](https://github.com/pytorch/pytorch/blob/267d0197bfca0232488d51dd1ff735d619adc2cf/torch/sparse/_semi_structured_conversions.py#L47-L179). Note that in this implementation, a [permutation](https://github.com/pytorch/pytorch/blob/267d0197bfca0232488d51dd1ff735d619adc2cf/torch/sparse/_semi_structured_conversions.py#L173-L175) is applied to match CUTLASS’s metadata layout. If you do not annotate a metadata layout when using `gemm_sp_v2`, your compressor should replicate the same behavior as the PyTorch example—but without using the `_calculate_meta_reordering_scatter_offsets` function. + +If you want to use a custom metadata layout in your kernel, one approach is to define the layout in `TileLang` and then apply the same layout to both your compressor kernel and the matmul_sp kernel. 
+ +```python + +@tilelang.jit(out_idx=[1, 2], pass_configs={ + tilelang.PassConfigKey.TIR_DISABLE_VECTORIZE: True, +}) +def compress_kernel(M, K, block_M, block_K, dtype, use_cutlass_layout): + e_factor, e_dtype = ARCH_INFO["8.0"] + e_K = K // e_factor + elem, group = 2, 4 + + assert M % block_M == 0, "M must be divisible by block_M" + assert K % block_K == 0, "K must be divisible by block_K" + assert K % e_factor == 0, "K must be divisible by e_factor" + assert block_K % e_factor == 0, "block_K must be divisible by e_factor" + + @T.prim_func + def kernel( + A: T.Tensor((M, K), dtype), + A_sp: T.Tensor((M, K // 2), dtype), + E: T.Tensor((M, e_K), e_dtype), + ): + with T.Kernel(T.ceildiv(M, block_M), T.ceildiv(K, block_K), threads=block_M) as (bx, by): + A_shared = T.alloc_shared((block_M, block_K), dtype) + A_sp_shared = T.alloc_shared((block_M, block_K // 2), dtype) + E_shared = T.alloc_shared((block_M, block_K // e_factor), e_dtype) + if use_cutlass_layout: # NOTE: Make sure compressor metadata layout + T.annotate_layout({ # is same with your computation kernel + E: + make_cutlass_metadata_layout( + E, mma_dtype="float16", arch="8.0", block_k=block_K), + E_shared: + make_cutlass_metadata_layout( + E_shared, + mma_dtype="float16", + arch="8.0", + block_k=block_K), + }) + T.clear(A_sp_shared) + T.clear(E_shared) + non_zero_cnt = T.alloc_local((1, ), dtype="uint8") + non_zero_elt_log_idx = T.alloc_local((elem, ), dtype="uint8") + T.copy(A[bx * block_M, by * block_K], A_shared) + for tm in T.Parallel(block_M): + for g_i in range(0, block_K // group): + a_k = g_i * group + T.clear(non_zero_cnt) + T.clear(non_zero_elt_log_idx) + for i in range(group): + val = A_shared[tm, a_k + i] + if val != 0.0: + non_zero_elt_log_idx[non_zero_cnt[0]] = i + A_sp_shared[tm, a_k // 2 + non_zero_cnt[0]] = val + non_zero_cnt[0] += 1 + if non_zero_cnt[0] == 1 and non_zero_elt_log_idx[0] == 3: + non_zero_elt_log_idx[0] = 0 + non_zero_elt_log_idx[1] = 3 + A_sp_shared[tm, a_k // 2 + 1] = A_sp_shared[tm, a_k // 2] + A_sp_shared[tm, a_k // 2] = 0.0 + elif non_zero_cnt[0] == 1: + A_sp_shared[tm, a_k // 2 + 1] = 0 + non_zero_elt_log_idx[1] = 3 + for i in T.serial(elem): + val = non_zero_elt_log_idx[i] + E_shared[tm, a_k // e_factor] |= T.shift_left(val, 4 * (g_i % (e_factor // group)) + 2 * i) + T.copy(A_sp_shared, A_sp[bx * block_M, by * block_K // 2]) + T.copy(E_shared, E[bx * block_M, by * block_K // e_factor]) + + return kernel +``` + +## A note on `gemm_sp` and `gemm_sp_v2` + +Initially, `T.gemm_sp` followed the same design as `T.gemm`, lowering to a `CUTLASS` template. This inherently requires metadata to be reordered offline following a predetermined layout. + +However, fixing a specific layout introduces several potential issues: + +1. Painful debugging experience: Debugging a failed kernel becomes difficult due to the reordered indexing, including permutations and swizzling. + +2. Limited flexibility: For example, concatenating two compressed tensors, such as `A_sparse_0` and `A_sparse_1`, into a new `A_sparse` makes sense. However, concatenating their metadata `E_0` and `E_1` may not be valid unless the layout allows it mathematically. + +3. Alignment requirements: `CUTLASS` enforces strict alignment checks, and many hyperparameter configurations can lead to compilation errors. (For reference, sm8x was implemented in `CUTLASS 2`.) + +`T.gemm_sp_v2` was designed to address these limitations, following the approach of `T.gemm_v2`. It lowers directly to PTX, removing the need for a fixed metadata layout. 
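+
+As a concrete illustration of the migration described above, only the intrinsic name at the call site changes. This is a sketch that reuses the buffer names from the `matmul_sp_sm80` kernel shown earlier; the surrounding kernel stays the same:
+
+```python
+# Before: requires the CUTLASS-compatible metadata layout annotated via T.annotate_layout
+T.gemm_sp(A_shared, E_shared, B_shared, C_frag, trans_A, trans_B)
+
+# After: the layout annotation becomes optional, and custom layouts/compressors are supported
+T.gemm_sp_v2(A_shared, E_shared, B_shared, C_frag, trans_A, trans_B)
+```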
\ No newline at end of file diff --git a/docs/get_started/Installation.md b/docs/get_started/Installation.md index 3d5c6db9d52af6e95c0593a073a8cee9366c1130..8fa41c023ad82fcf6004b230c9b556f87aaa32a4 100644 --- a/docs/get_started/Installation.md +++ b/docs/get_started/Installation.md @@ -8,25 +8,25 @@ - **Python Version**: >= 3.8 - **CUDA Version**: 12.0 <= CUDA < 13 -The easiest way to install **tile-lang** is directly from PyPI using pip. To install the latest version, run the following command in your terminal: +The easiest way to install tilelang is directly from PyPI using pip. To install the latest version, run the following command in your terminal: ```bash pip install tilelang ``` -Alternatively, you may choose to install **tile-lang** using prebuilt packages available on the Release Page: +Alternatively, you may choose to install tilelang using prebuilt packages available on the Release Page: ```bash pip install tilelang-0.0.0.dev0+ubuntu.20.4.cu120-py3-none-any.whl ``` -To install the latest version of **tile-lang** from the GitHub repository, you can run the following command: +To install the latest version of tilelang from the GitHub repository, you can run the following command: ```bash pip install git+https://github.com/tile-ai/tilelang.git ``` -After installing **tile-lang**, you can verify the installation by running: +After installing tilelang, you can verify the installation by running: ```bash python -c "import tilelang; print(tilelang.__version__)" @@ -40,18 +40,18 @@ python -c "import tilelang; print(tilelang.__version__)" - **Python Version**: >= 3.8 - **CUDA Version**: >= 10.0 -```bash -docker run -it --rm --ipc=host nvcr.io/nvidia/pytorch:23.01-py3 -``` +If you prefer Docker, please skip to the [Install Using Docker](#install-using-docker) section. This section focuses on building from source on a native Linux environment. -To build and install **tile-lang** directly from source, follow these steps. This process requires certain pre-requisites from Apache TVM, which can be installed on Ubuntu/Debian-based systems using the following commands: +First, install the OS-level prerequisites on Ubuntu/Debian-based systems using the following commands: ```bash apt-get update apt-get install -y python3 python3-dev python3-setuptools gcc zlib1g-dev build-essential cmake libedit-dev ``` -After installing the prerequisites, you can clone the **tile-lang** repository and install it using pip: +Then, clone the tilelang repository and install it using pip. The `-v` flag enables verbose output during the build process. + +> **Note**: Use the `--recursive` flag to include necessary submodules. Tilelang currently depends on a customized version of TVM, which is included as a submodule. If you prefer [Building with Existing TVM Installation](#using-existing-tvm), you can skip cloning the TVM submodule (but still need other dependencies). ```bash git clone --recursive https://github.com/tile-ai/tilelang.git @@ -59,13 +59,19 @@ cd tilelang pip install . -v ``` -If you want to install **tile-lang** in development mode, you can run the following command: +If you want to install tilelang in development mode, you can use the `-e` flag so that any changes to the Python files will be reflected immediately without reinstallation. ```bash pip install -e . -v ``` -If you prefer to work directly from the source tree via `PYTHONPATH`, make sure the native extension is built first: +> **Note**: changes to C++ files require rebuilding the tilelang C++ library. 
See [Faster Rebuild for Developers](#faster-rebuild-for-developers) below. A default `build` directory will be created if you use `pip install`, so you can also directly run `make` in the `build` directory to rebuild it as [Working from Source via PYTHONPATH](#working-from-source-via-pythonpath) suggested below. + +(working-from-source-via-pythonpath)= + +### Working from Source via `PYTHONPATH` (Recommended for Developers) + +If you prefer to work directly from the source tree via `PYTHONPATH` instead of using pip, make sure the native extension (`libtilelang.so`) is built first: ```bash mkdir -p build @@ -73,6 +79,14 @@ cd build cmake .. -DUSE_CUDA=ON make -j ``` + +We also recommend using `ninja` to speed up compilation: + +```bash +cmake .. -DUSE_CUDA=ON -G Ninja +ninja +``` + Then add the repository root to `PYTHONPATH` before importing `tilelang`, for example: ```bash @@ -85,17 +99,23 @@ Some useful CMake options you can toggle while configuring: - `-DUSE_ROCM=ON` selects ROCm support when building on AMD GPUs. - `-DNO_VERSION_LABEL=ON` disables the backend/git suffix in `tilelang.__version__`. -We currently provide four methods to install **tile-lang**: +(using-existing-tvm)= + +### Building with Customized TVM Path + +If you already have a TVM codebase, use the `TVM_ROOT` environment variable to specify the location of your existing TVM repository when building tilelang: + +```bash +TVM_ROOT= pip install . -v +``` -1. [Install Using Docker](#install-method-1) (Recommended) -2. [Install from Source (using the bundled TVM submodule)](#install-method-2) -3. [Install from Source (using your own TVM installation)](#install-method-3) +> **Note**: This will still rebuild the TVM-related libraries (stored in `TL_LIBS`). And this method often leads to some path issues. Check `env.py` to see some environment variables which are not set properly. -(install-method-1)= +(install-using-docker)= -### Method 1: Install Using Docker (Recommended) +## Install Using Docker -For users who prefer a containerized environment with all dependencies pre-configured, **tile-lang** provides Docker images for different CUDA versions. This method is particularly useful for ensuring consistent environments across different systems and is the **recommended approach** for most users. +For users who prefer a containerized environment with all dependencies pre-configured, tilelang provides Docker images for different CUDA versions. This method is particularly useful for ensuring consistent environments across different systems. **Prerequisites:** - Docker installed on your system @@ -142,82 +162,17 @@ docker run -itd \ - `--name tilelang_b200`: Assigns a name to the container for easy management - `/bin/zsh`: Uses zsh as the default shell -4. **Access the Container**: +4. **Access the Container and Verify Installation**: ```bash docker exec -it tilelang_b200 /bin/zsh -``` - -5. **Verify Installation**: - -Once inside the container, verify that **tile-lang** is working correctly: - -```bash +# Inside the container: python -c "import tilelang; print(tilelang.__version__)" ``` -You can now run TileLang examples and develop your applications within the containerized environment. The Docker image comes with all necessary dependencies pre-installed, including CUDA toolkit, TVM, and TileLang itself. 
- -**Example Usage:** - -After accessing the container, you can run TileLang examples: - -```bash -cd /home/tilelang/examples -python elementwise/test_example_elementwise.py -``` - -This Docker-based installation method provides a complete, isolated environment that works seamlessly on systems with compatible NVIDIA GPUs like the B200, ensuring optimal performance for TileLang applications. - -(install-method-2)= - -### Method 2: Install from Source (Using the Bundled TVM Submodule) - -If you already have a compatible TVM installation, follow these steps: - -1. **Clone the Repository**: - -```bash -git clone --recursive https://github.com/tile-ai/tilelang -cd tilelang -``` - -**Note**: Use the `--recursive` flag to include necessary submodules. - -2. **Configure Build Options**: - -Create a build directory and specify your existing TVM path: - -```bash -pip install . -v -``` - -(install-method-3)= - -### Method 3: Install from Source (Using Your Own TVM Installation) - -If you prefer to use the built-in TVM version, follow these instructions: - -1. **Clone the Repository**: - -```bash -git clone --recursive https://github.com/tile-ai/tilelang -cd tilelang -``` - -**Note**: Ensure the `--recursive` flag is included to fetch submodules. - -2. **Configure Build Options**: - -Copy the configuration file and enable the desired backends (e.g., LLVM and CUDA): - -```bash -TVM_ROOT= pip install . -v -``` - ## Install with Nightly Version -For users who want access to the latest features and improvements before official releases, we provide nightly builds of **tile-lang**. +For users who want access to the latest features and improvements before official releases, we provide nightly builds of tilelang. ```bash pip install tilelang -f https://tile-ai.github.io/whl/nightly/cu121/ @@ -252,24 +207,28 @@ Set `NO_TOOLCHAIN_VERSION=ON` to disable this. ### Run-time environment variables - +Please refer to the `env.py` file for a full list of supported run-time environment variables. + +## Other Tips -## IDE Configs +### IDE Configs -Building tilelang locally will automatically `compile_commands.json` file in `build` dir. +Building tilelang locally will automatically generate a `compile_commands.json` file in `build` dir. VSCode with clangd and [clangd extension](https://marketplace.visualstudio.com/items?itemName=llvm-vs-code-extensions.vscode-clangd) should be able to index that without extra configuration. -## Compile cache +### Compile Cache -`ccache` will be automatically used if found. +The default path of the compile cache is `~/.tilelang/cache`. `ccache` will be automatically used if found. -## Repairing wheels +### Repairing Wheels If you plan to use your wheel in other environment, -it's recommend to use auditwheel (on Linux) or delocate (on Darwin) +it's recommended to use auditwheel (on Linux) or delocate (on Darwin) to repair them. -## Faster rebuild for developers +(faster-rebuild-for-developers)= + +### Faster Rebuild for Developers `pip install` introduces extra [un]packaging and takes ~30 sec to complete, even if no source change. @@ -278,8 +237,17 @@ Developers who needs to recompile frequently could use: ```bash pip install -r requirements-dev.txt + +# For first time compilation pip install -e . -v --no-build-isolation +# Or manually compile with cmake/ninja. Remember to set PYTHONPATH properly. +mkdir build +cd build +cmake .. 
-G Ninja +ninja + +# Rebuild when you change the cpp code cd build; ninja ``` diff --git a/docs/index.md b/docs/index.md index 5d9a158f80861ee2255785a90748fc2a271ac026..55804259a46a857b3589919e0536e565950ffa2d 100644 --- a/docs/index.md +++ b/docs/index.md @@ -2,10 +2,10 @@ [GitHub](https://github.com/tile-ai/tilelang) -Tile Language (tile-lang) is a concise domain-specific language designed to streamline -the development of high-performance GPU/CPU kernels (e.g., GEMM, Dequant GEMM, FlashAttention, LinearAttention). -By employing a Pythonic syntax with an underlying compiler infrastructure on top of TVM, -tile-lang allows developers to focus on productivity without sacrificing the +Tile Language (tile-lang) is a concise domain-specific language designed to streamline +the development of high-performance GPU/CPU kernels (e.g., GEMM, Dequant GEMM, FlashAttention, LinearAttention). +By employing a Pythonic syntax with an underlying compiler infrastructure on top of TVM, +tile-lang allows developers to focus on productivity without sacrificing the low-level optimizations necessary for state-of-the-art performance. :::{toctree} @@ -24,6 +24,19 @@ get_started/targets tutorials/debug_tools_for_tilelang tutorials/auto_tuning +tutorials/logging +::: + +:::{toctree} +:maxdepth: 1 +:caption: PROGRAMMING GUIDES + +programming_guides/overview +programming_guides/language_basics +programming_guides/instructions +programming_guides/control_flow +programming_guides/autotuning +programming_guides/type_system ::: :::{toctree} @@ -33,6 +46,7 @@ tutorials/auto_tuning deeplearning_operators/elementwise deeplearning_operators/gemv deeplearning_operators/matmul +deeplearning_operators/matmul_sparse deeplearning_operators/deepseek_mla ::: @@ -42,6 +56,7 @@ deeplearning_operators/deepseek_mla compiler_internals/letstmt_inline compiler_internals/inject_fence_proxy +compiler_internals/tensor_checks ::: :::{toctree} diff --git a/docs/programming_guides/autotuning.md b/docs/programming_guides/autotuning.md new file mode 100644 index 0000000000000000000000000000000000000000..66d46889fe88e4b1de874d552e2f1c922c534660 --- /dev/null +++ b/docs/programming_guides/autotuning.md @@ -0,0 +1,308 @@ +# Autotuning + +TileLang includes a built‑in autotuner that searches configuration spaces +for the best performing kernel, compiles candidates in parallel, validates +correctness, benchmarks them, and caches the best result for reuse. + +This guide covers two workflows: +- Decorator‑based: `@tilelang.autotune(configs=...)` stacked on `@tilelang.jit` +- Programmatic: `AutoTuner.from_kernel(...).set_*().run()` + +It also explains input tensor supply, validation, caching, and environment +variables that affect parallelism and cache behavior. + +## 1) Decorator‑based Autotune + +Use `@tilelang.autotune` above `@tilelang.jit` and expose tunable parameters as +function arguments with defaults. The autotuner overrides these parameters with +values from your config space. 
+ +```python +import tilelang +import tilelang.language as T + +def matmul_configs(M, N, K): + # Example space — tailor to your target + tiles = [64, 128] + stages = [2, 3] + threads = [128, 256] + return [ + dict(block_M=BM, block_N=BN, block_K=BK, num_stages=S, threads=TH) + for BM in tiles + for BN in tiles + for BK in [32, 64] + for S in stages + for TH in threads + ] + +@tilelang.autotune(configs=matmul_configs, warmup=25, rep=100, timeout=60) +@tilelang.jit(out_idx=[-1]) +def matmul(M: int, N: int, K: int, + block_M: int = 128, block_N: int = 128, block_K: int = 32, + threads: int = 128, num_stages: int = 3, + dtype: str = 'float16', accum_dtype: str = 'float32'): + + @T.prim_func + def kernel(A: T.Tensor((M, K), dtype), + B: T.Tensor((K, N), dtype), + C: T.Tensor((M, N), dtype)): + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): + A_s = T.alloc_shared((block_M, block_K), dtype) + B_s = T.alloc_shared((block_K, block_N), dtype) + C_f = T.alloc_fragment((block_M, block_N), accum_dtype) + T.clear(C_f) + + for ko in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages): + T.copy(A[by * block_M, ko * block_K], A_s) + T.copy(B[ko * block_K, bx * block_N], B_s) + T.gemm(A_s, B_s, C_f) + + T.copy(C_f, C[by * block_M, bx * block_N]) + + return kernel + +# Usage +# Provide inputs via context (recommended for reproducibility across configs) +import torch +M = N = K = 1024 +A = torch.randn(M, K, device='cuda', dtype=torch.float16) +B = torch.randn(K, N, device='cuda', dtype=torch.float16) +C = torch.empty(M, N, device='cuda', dtype=torch.float16) + +from tilelang.autotuner import set_autotune_inputs +with set_autotune_inputs(A, B, C): + tuned_kernel = matmul(M, N, K) # compiles, tunes, returns best kernel + tuned_kernel(A, B, C) # run best kernel +``` + +Notes +- `configs` can be a list of dicts or a callable `(args...) -> list[dict]`. Each + dict’s keys must match the tunable function arguments (e.g., `block_M`). +- The decorator returns a callable that runs autotune once per argument tuple + and caches the resulting best kernel in‑process. +- For explicit input control during tuning, wrap the call with + `set_autotune_inputs(...)`. Otherwise, `supply_type` (below) is used. + +## 2) Programmatic Autotune + +Use the `AutoTuner` class to manage configs and arguments more explicitly. 
+ +```python +from tilelang.autotuner import AutoTuner + +kernel_factory = matmul # the function above (already @tilelang.jit) +tuner = AutoTuner.from_kernel(kernel_factory(M, N, K), configs=matmul_configs(M, N, K)) + +tuner.set_profile_args( + warmup=25, rep=100, timeout=60, + supply_type=tilelang.TensorSupplyType.Auto, # or provide supply_prog/ref_prog + ref_prog=lambda A, B, C: torch.allclose(C, (A @ B).to(C.dtype), rtol=1e-2, atol=1e-2), +) + +tuner.set_compile_args( + target='auto', # or 'cuda'/'hip'/'metal' + execution_backend='auto', # resolves per-target + out_idx=[-1], # which outputs to return if multiple + pass_configs={ # optional TVM passes/flags + # tilelang.PassConfigKey.EXAMPLE_KEY: value, + }, +) + +artifact = tuner.run() # compiles + runs + validates all configs +best_kernel = artifact.kernel # JITKernel +best_latency = artifact.latency +best_config = artifact.config + +# Reuse best kernel +best_kernel(A, B, C) +``` + +### Example Gallery (in repo) +- examples/gdn/example_chunk_delta_h.py:101 — uses `@autotune` to sweep configs +- examples/deepseek_nsa/benchmark/benchmark_nsa_fwd.py:451 — uses `@tilelang.autotune` +- examples/quickstart.py:84 — profiles a tuned kernel with `get_profiler` +- examples/hadamard_transform/example_hadamard.py:152 — profiler with custom warmup +- examples/dynamic_shape/example_dynamic.py:94 — profiler for dynamic shapes +- examples/gemm/example_gemm_persistent.py:135 — compare persistent vs non‑persistent + +Click any path to open the code and compare patterns. + +## Input Tensor Supply + +The tuner needs inputs to compile and benchmark kernels. Provide them in one of +three ways (priority order): + +1) Context manager (fixed inputs across configs) +```python +with set_autotune_inputs(A, B, C): + tuned = matmul(M, N, K) +``` + +2) Custom supplier program +```python +def supply_prog(signature): + # signature holds KernelParam objects describing shapes/dtypes + # Return a list of torch tensors matching the kernel’s arguments + return [A, B, C] + +tuner.set_profile_args(supply_prog=supply_prog) +``` + +3) Built‑in generators via `supply_type` +- `TensorSupplyType.Auto` (default): heuristic per dtype (uniform ints / fp ranges) +- `Integer`, `Uniform`, `Normal`, `Randn`, `Zero`, `One` + +Important +- Built‑in generators require static shapes; if your PrimFunc uses symbolic + dimensions (T.dyn), supply concrete inputs via (1) or (2). +- Float8 dtypes require PyTorch 2.1+ for `torch.float8_*` support. + +## Correctness Checking and Tolerances + +Use one of the following validation methods: +- `ref_prog`: Provide a reference program that receives the same inputs and + checks results. You can return a boolean or raise on mismatch. +- `manual_check_prog`: A callable that inspects outputs and raises on mismatch. +- `skip_check=True`: Skip correctness checks (faster, use with caution). + +Control numeric drift via: +- `rtol` and `atol` (defaults 1e‑2) +- `max_mismatched_ratio` (default 1%) + +## Configuration Spaces and Best Practices + +What to tune +- Tile sizes: `block_M`, `block_N`, `block_K` +- Software pipelining: `num_stages` +- Threads per block: `threads` (or (x, y) tuple) +- Optional: dtype variants, epilogues, small scheduling knobs + +Tips +- Start from a working baseline. Tune a small, meaningful space first. +- Respect hardware limits (shared memory bytes, registers per thread/block, + max threads per block). Eliminate impossible configs up‑front. +- Keep block sizes multiples of vector widths and warp sizes when relevant. 
+- Use `set_autotune_inputs` to ensure each config is measured on identical data. +- Record your best configs and bake them as defaults when stable. + +## Parallel Compilation/Benchmarking and Timeouts + +The tuner compiles configurations in parallel using a thread pool and benchmarks +them with a per‑config timeout. On CUDA, each worker sets the current device to +avoid context issues. + +Notes +- `timeout` uses POSIX signals; on non‑Unix systems, it may not take effect. +- Logs are written to `autotuner.log` in the working directory. + +## Caching + +The autotuner caches best artifacts both in‑memory (per process) and on disk under +`$TILELANG_CACHE_DIR/autotuner`. The cache key includes: +- TileLang version, function source, closure free‑vars +- Config list, compile args, profile args + +Disk cache contents (per key) +- Best config and latency: `best_config.json`, `latency.json` +- Kernel sources and library: `device_kernel.cu`, `host_kernel.cu`, `kernel_lib.so` (or `kernel.cubin`/`executable.so` depending on backend) +- Function and params: `function.pkl`, `params.pkl` + +Control via env vars (tilelang.env) +- `TILELANG_CACHE_DIR` (default `~/.tilelang/cache`) +- `TILELANG_TMP_DIR` (default `$TILELANG_CACHE_DIR/tmp`) +- Disable all kernel caches: `TILELANG_DISABLE_CACHE=1` +- Disable autotune disk cache only: `TILELANG_AUTO_TUNING_DISABLE_CACHE=1` + +CPU worker control +- `TILELANG_AUTO_TUNING_CPU_UTILITIES` (fraction, default 0.9) +- `TILELANG_AUTO_TUNING_CPU_COUNTS` (int, `-1` auto) +- `TILELANG_AUTO_TUNING_MAX_CPU_COUNT` (int, `-1` unlimited) + +Backend notes +- NVRTC backend persists `.cubin` and a Python launcher. +- Torch/DLPack backend may not save artifacts to disk; in this case, only + in‑memory caching applies and a warning is logged. + +## Alternative: Manual Sweeps with par_compile + +If you prefer manual control, use `JITImpl.par_compile` to compile a batch of +configs and drive your own benchmarking: + +```python +@tilelang.jit +def factory(M, N, K, block_M=128, block_N=128, block_K=32): + @T.prim_func + def k(A: T.Tensor((M, K), 'float16'), + B: T.Tensor((K, N), 'float16'), + C: T.Tensor((M, N), 'float16')): + ... + return k + +impl = factory # JITImpl +cfgs = [ + dict(block_M=64, block_N=128, block_K=32), + dict(block_M=128, block_N=128, block_K=64), +] +kernels = impl.par_compile(cfgs, num_workers=4) +# Now benchmark kernels[i](A, B, C) yourself +``` + +## Recording and Reusing Best Configs + +The programmatic path returns an `AutotuneResult` that can be saved and later +reloaded. This is useful for CI, multi‑host workflows, or shipping tuned configs. + +```python +artifact = tuner.run() # AutotuneResult + +# Save to disk +from pathlib import Path +save_dir = Path('out/best/matmul_1024') +artifact.save_to_disk(save_dir, verbose=True) + +# Reload later +from tilelang.autotuner.param import AutotuneResult, CompileArgs +restored = AutotuneResult.load_from_disk(save_dir, CompileArgs()) +best = restored.kernel +best(A, B, C) +``` + +Notes +- DLPack/Torch execution backend may not persist compiled binaries; in that + case, re‑compilation is needed on load or use a different backend. +- The directory contains human‑readable JSONs (best config/latency) and sources. 
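+
+If you only need the winning parameters rather than the serialized kernel, a
+lightweight pattern is to read the recorded config back and pass it to a plain
+`@tilelang.jit` factory as keyword arguments. The sketch below is illustrative:
+it assumes a `matmul` factory like the decorator example above but without the
+autotune decorator, and that the saved directory contains the `best_config.json`
+described earlier.
+
+```python
+import json
+from pathlib import Path
+
+def build_with_recorded_config(save_dir, M, N, K):
+    # Assumes `best_config.json` holds the winning parameter dict, e.g.
+    # {"block_M": 128, "block_N": 128, "block_K": 64, "num_stages": 3, "threads": 256}
+    best_cfg = json.loads((Path(save_dir) / 'best_config.json').read_text())
+    # `matmul` is assumed to be a plain @tilelang.jit factory whose tunable
+    # parameters are ordinary keyword arguments (see the decorator example above).
+    return matmul(M, N, K, **best_cfg)
+
+kernel = build_with_recorded_config('out/best/matmul_1024', 1024, 1024, 1024)
+kernel(A, B, C)
+```
+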
+ +## Advanced: Config Space Callables + +Derive config spaces from problem sizes to keep searches targeted and legal: + +```python +def matmul_configs(M, N, K): + large = min(M, N, K) >= 1024 + tiles = [128] if large else [64, 128] + for BM in tiles: + for BN in tiles: + for BK in [32, 64]: + for S in [2, 3]: + for TH in [128, 256]: + yield dict(block_M=BM, block_N=BN, block_K=BK, + num_stages=S, threads=TH) +``` + +## Device and Backend Selection + +Tune compile‑time options explicitly: +- `target='auto'|'cuda'|'hip'|'metal'` (normalized to a TVM Target) +- `execution_backend='auto'|'tvm_ffi'|'ctypes'|'cython'|'nvrtc'|'torch'` +- `pass_configs={...}` to toggle TileLang/TVM passes for experiments + +On CUDA with multiple GPUs, the tuner sets the current device per worker thread +to avoid context mixups. + +## Troubleshooting +- “No configurations to tune”: Ensure `configs` is a non‑empty list or callable. +- Timeouts: Increase `timeout`; ensure inputs fit device memory; verify that + your reference check isn’t the bottleneck. +- Dynamic shapes: Provide concrete inputs via `set_autotune_inputs` or a custom + `supply_prog`. +- Disk cache disabled: Check `TILELANG_AUTO_TUNING_DISABLE_CACHE` and backend. diff --git a/docs/programming_guides/control_flow.md b/docs/programming_guides/control_flow.md new file mode 100644 index 0000000000000000000000000000000000000000..158c51166e501a6628618e028ed0bbd904f7a47d --- /dev/null +++ b/docs/programming_guides/control_flow.md @@ -0,0 +1,145 @@ +# Control Flow + +This guide covers the control‑flow primitives in TileLang and how they lower to +efficient GPU code. You will use these to structure loops, handle boundaries, +and express pipelined compute. + +## Overview +- Conditionals: `if` / `elif` / `else`, ternary (`x if c else y`) +- Loops: `T.serial`, `T.unroll`, `T.Parallel`, `T.Pipelined` +- While loops: `while` with a TIR condition +- Flow control: Python `break` / `continue` +- Safety: automatic OOB guards via the LegalizeSafeMemoryAccess pass + +The examples assume `import tilelang.language as T`. + +## Conditionals + +Standard Python `if`/`elif`/`else` is supported inside `@T.prim_func` kernels. +Conditions should be TIR expressions (e.g., `i < N`). Python plain booleans are +treated as compile‑time constants and will be folded. + +```python +for i in T.serial(N): + if i < N: # TIR condition + C[i] = A[i] + B[i] + else: + pass + +# Ternary +x = (A[i] if i < N else 0) +``` + +Short‑circuit boolean ops are supported. For multi‑dimensional bounds, use +`T.any_of` / `T.all_of` for clarity: + +```python +if T.all_of(i < M, j < N): + C[i, j] = A[i, j] + B[i, j] +``` + +Boundary handling note +- The LegalizeSafeMemoryAccess pass automatically inserts guards when an access + may be out‑of‑bounds, and elides them when proven safe. You can often omit + explicit `if` checks for simple edge handling, but keep them when you need + custom logic or clarity. + +## Loops + +### Serial + +`T.serial` creates a plain for‑loop. Common forms: + +```python +for i in T.serial(N): + ... # 0..N-1 + +for i in T.serial(0, N, 2): + ... # 0, 2, 4, ... +``` + +### Unroll + +`T.unroll` requests loop unrolling for small trip counts. + +```python +for k in T.unroll(K_TILE): + acc += a[k] * b[k] +``` + +Advanced: TileLang forwards unroll hints to TIR; factor/explicit knobs are +available for expert tuning. + +### Parallel (elementwise) + +`T.Parallel(ext0, ext1, ...)` builds nested loops that map well to elementwise +operations. 
The body receives all indices in one `for` header: + +```python +for i, j in T.Parallel(M, N): + C[i, j] = A[i, j] + B[i, j] +``` + +Optional: `coalesced_width=` can hint memory coalescing for the innermost loop. + +### Pipelined (software pipelining) + +`T.Pipelined(iters, num_stages=...)` overlaps producer/consumer stages (e.g., +Global→Shared copies with compute). This is the backbone of GEMM/attention +pipelines. + +```python +for ko in T.Pipelined(T.ceildiv(K, BK), num_stages=3): + T.copy(A[by * BM, ko * BK], A_s) # stage: copy A tile + T.copy(B[ko * BK, bx * BN], B_s) # stage: copy B tile + T.gemm(A_s, B_s, C_f) # stage: compute +``` + +### Persistent (advanced) + +`T.Persistent(domain, wave_size, index, group_size=...)` exposes persistent +thread‑block style looping. It is an advanced construct that TileLang lowers in +later passes and is typically used by specialized templates. + +## While Loops + +`while` is supported when the condition is a TIR expression. Avoid infinite +loops; TileLang will error if it detects a constant‑true condition. + +```python +i = 0 +while i < N: + ... + if done: + break + i += 1 +``` + +## Break and Continue + +Use Python `break`/`continue` to exit or skip within `T.serial`/`T.unroll`/ +`T.Parallel`/`while` loops. Keep the body clean after a `break`/`continue` for +readability; the compiler will ignore the dead path. + +## Putting It Together: Residual Tile Handling + +Below is a typical edge pattern for a 2D kernel. With LegalizeSafeMemoryAccess, +the explicit guard can be omitted when you don’t need a custom edge path. + +```python +for i, j in T.Parallel(M, N): + gi = by * BM + i + gj = bx * BN + j + if T.all_of(gi < M, gj < N): # optional in many cases + C[gi, gj] = A[gi, gj] + B[gi, gj] +``` + +## Debugging Conditions + +Use `T.print` to inspect values under predicates. For buffers, TileLang prints +from a single thread to avoid duplicate outputs. + +```python +if i == 0: + T.print(C, msg='C tile:') +``` diff --git a/docs/programming_guides/instructions.md b/docs/programming_guides/instructions.md new file mode 100644 index 0000000000000000000000000000000000000000..84bd9217990003044a97e6c59007486cae64566f --- /dev/null +++ b/docs/programming_guides/instructions.md @@ -0,0 +1,182 @@ +# Instructions + +This page summarizes the core TileLang “instructions” available at the DSL +level, how they map to hardware concepts, and how to use them correctly. + +## Quick Categories +- Data movement: `T.copy`, `T.c2d_im2col`, staging Global ↔ Shared ↔ Fragment +- Compute primitives: `T.gemm`/`T.gemm_sp`, elementwise math (`T.exp`, `T.max`), + reductions (`T.reduce_sum`, `T.cumsum`, warp reducers) +- Control helpers: `T.clear`/`T.fill`, `T.reshape`/`T.view` +- Diagnostics: `T.print`, `T.device_assert` +- Advanced: atomics, memory barriers, warp‑group ops + +## Data Movement + +Use `T.copy(src, dst, coalesced_width=None, disable_tma=False, eviction_policy=None)` +to move tiles between memory scopes. It accepts `tir.Buffer`, `BufferLoad`, or +`BufferRegion`; extents are inferred or broadcast when possible. + +```python +# Global → Shared tiles (extents inferred from dst) +T.copy(A[by * BM, ko * BK], A_s) +T.copy(B[ko * BK, bx * BN], B_s) + +# Fragment/Register → Global (store result) +T.copy(C_f, C[by * BM, bx * BN]) +``` + +Semantics +- Extents are deduced from arguments; missing sides broadcast to the other’s rank. +- Access patterns are legalized and coalesced during lowering. Explicit + vectorization is not required in HL mode. 
+- Safety: the LegalizeSafeMemoryAccess pass inserts boundary guards when an + access may be out‑of‑bounds and drops them when proven safe. + +Other helpers +- `T.c2d_im2col(img, col, ...)`: convenience for conv‑style transforms. + +## Compute Primitives + +GEMM and sparse GEMM +- `T.gemm(A_shared, B_shared, C_fragment)`: computes a tile GEMM using shared + inputs and a fragment accumulator; lowered to target‑specific tensor cores. +- `T.gemm_sp(...)`: 2:4 sparse tensor core variant (see examples and README). + +Reductions and scans +- `T.reduce_sum`, `T.reduce_max`, `T.reduce_min`, `T.cumsum`, plus warp + reducers (`T.warp_reduce_sum`, etc.). +- Allocate and initialize accumulators via `T.alloc_fragment` + `T.clear` or + `T.fill`. + +Elementwise math +- Most math ops mirror TVM TIR: `T.exp`, `T.log`, `T.max`, `T.min`, `T.rsqrt`, + `T.sigmoid`, etc. Compose freely inside loops. + +Reshape/view (no copy) +- `T.reshape(buf, new_shape)` and `T.view(buf, shape=None, dtype=None)` create + new views that share storage, with shape/dtype checks enforced. + +## Synchronization (HL usage) + +In HL pipelines, you usually don’t need to write explicit barriers. Passes such +as PipelinePlanning/InjectSoftwarePipeline/InjectTmaBarrier orchestrate +producer/consumer ordering and thread synchronization behind the scenes. + +If you need debugging or explicit checks: +- `T.device_assert(cond, msg='')` emits device‑side asserts on CUDA targets. +- `T.print(obj, msg='...')` prints scalars or buffers safely from one thread. + +## Putting It Together: GEMM Tile + +```python +@T.prim_func +def gemm( + A: T.Tensor((M, K), 'float16'), + B: T.Tensor((K, N), 'float16'), + C: T.Tensor((M, N), 'float16'), +): + with T.Kernel(T.ceildiv(N, BN), T.ceildiv(M, BM), threads=128) as (bx, by): + A_s = T.alloc_shared((BM, BK), 'float16') + B_s = T.alloc_shared((BK, BN), 'float16') + C_f = T.alloc_fragment((BM, BN), 'float32') + T.clear(C_f) + + for ko in T.Pipelined(T.ceildiv(K, BK), num_stages=3): + T.copy(A[by * BM, ko * BK], A_s) # Global → Shared + T.copy(B[ko * BK, bx * BN], B_s) + T.gemm(A_s, B_s, C_f) # compute into fragment + + T.copy(C_f, C[by * BM, bx * BN]) # store back +``` + +## Instruction Reference (Concise) + +Below is a concise list of TileLang instructions grouped by category. For full +signatures, behaviors, constraints, and examples, refer to API Reference +(`autoapi/tilelang/index`). + +Data movement +- `T.copy(src, dst, ...)`: Move tiles between Global/Shared/Fragment. +- `T.c2d_im2col(img, col, ...)`: 2D im2col transform for conv. + +Memory allocation and descriptors +- `T.alloc_shared(shape, dtype, scope='shared.dyn')`: Allocate shared buffer. +- `T.alloc_fragment(shape, dtype, scope='local.fragment')`: Allocate fragment. +- `T.alloc_var(dtype, [init], scope='local.var')`: Scalar var buffer (1 elem). +- `T.alloc_barrier(arrive_count)`: Shared barrier buffer. +- `T.alloc_tmem(shape, dtype)`: Tensor memory (TMEM) buffer (Hopper+). +- `T.alloc_reducer(shape, dtype, op='sum', replication=None)`: Reducer buf. +- `T.alloc_descriptor(kind, dtype)`: Generic descriptor allocator. + - `T.alloc_wgmma_desc(dtype='uint64')` + - `T.alloc_tcgen05_smem_desc(dtype='uint64')` + - `T.alloc_tcgen05_instr_desc(dtype='uint32')` +- `T.empty(shape, dtype='float32')`: Declare function output tensors. + +Compute primitives +- `T.gemm(A_s, B_s, C_f)`: Tile GEMM into fragment accumulator. +- `T.gemm_sp(...)`: Sparse (2:4) tensor core GEMM. +- Reductions: `T.reduce_sum/max/min/abssum/absmax`, bitwise `and/or/xor`. 
+- Scans: `T.cumsum`, finalize: `T.finalize_reducer`. +- Warp reducers: `T.warp_reduce_sum/max/min/bitand/bitor`. +- Elementwise math: TIR ops (`T.exp`, `T.log`, `T.max`, `T.min`, `T.rsqrt`, ...). +- Fast math: `T.__log/__log2/__log10/__exp/__exp2/__exp10/__sin/__cos/__tan`. +- IEEE math: `T.ieee_add/sub/mul/fmaf` (configurable rounding). +- Helpers: `T.clear(buf)`, `T.fill(buf, value)`. +- Views: `T.reshape(buf, shape)`, `T.view(buf, shape=None, dtype=None)`. + +Diagnostics +- `T.print(obj, msg='')`: Print scalar/buffer from one thread. +- `T.device_assert(cond, msg='')`: Device-side assert (CUDA). + +Logical helpers +- `T.any_of(a, b, ...)`, `T.all_of(a, b, ...)`: Multi-term predicates. + +Annotation helpers +- `T.use_swizzle(panel_size=..., enable=True)`: Rasterization hint. +- `T.annotate_layout({...})`: Attach explicit layouts to buffers. +- `T.annotate_safe_value(var, ...)`: Safety/const hints. +- `T.annotate_l2_hit_ratio(buf, ratio)`: Cache behavior hint. + +Atomics +- `T.atomic_add(dst, value, memory_order=None, return_prev=False, use_tma=False)`. +- `T.atomic_addx2(dst, value, return_prev=False)`; `T.atomic_addx4(...)`. +- `T.atomic_max(dst, value, memory_order=None, return_prev=False)`. +- `T.atomic_min(dst, value, memory_order=None, return_prev=False)`. +- `T.atomic_load(dst)`, `T.atomic_store(dst, value)`. + +Custom intrinsics +- `T.dp4a(A, B, C)`: 4‑element dot‑product accumulate. +- `T.clamp(x, lo, hi)`: Clamp to [lo, hi]. +- `T.loop_break()`: Break from current loop via intrinsic. + +Barriers, TMA, warp‑group +- Barriers: `T.create_list_of_mbarrier(...)`, `T.get_mbarrier(i)`. +- Parity ops: `T.mbarrier_wait_parity(barrier, parity)`, `T.mbarrier_arrive(barrier)`. +- Expect tx: `T.mbarrier_expect_tx(...)`; sugar: `T.barrier_wait(id, parity=None)`. +- TMA: `T.create_tma_descriptor(...)`, `T.tma_load(...)`, + `T.tma_store_arrive(...)`, `T.tma_store_wait(...)`. +- Proxy/fences: `T.fence_proxy_async(...)`, `T.warpgroup_fence_operand(...)`. +- Warp‑group: `T.warpgroup_arrive()`, `T.warpgroup_commit_batch()`, + `T.warpgroup_wait(num_mma)`, `T.wait_wgmma(id)`. + +Lane/warp index +- `T.get_lane_idx(warp_size=None)`: Lane id in warp. +- `T.get_warp_idx_sync(warp_size=None)`: Canonical warp id (sync). +- `T.get_warp_idx(warp_size=None)`: Canonical warp id (no sync). +- `T.get_warp_group_idx(warp_size=None, warps_per_group=None)`: Group id. + +Register control +- `T.set_max_nreg(reg_count, is_inc)`, `T.inc_max_nreg(n)`, `T.dec_max_nreg(n)`. +- `T.annotate_producer_reg_dealloc(n=24)`, `T.annotate_consumer_reg_alloc(n=240)`. +- `T.no_set_max_nreg()`, `T.disable_warp_group_reg_alloc()`. + + + +## Notes on Dtypes + +Dtypes accept three equivalent forms: +- String: `'float32'` +- TileLang dtype: `T.float32` +- Framework dtype: `torch.float32` +All are normalized internally. See Type System for details. diff --git a/docs/programming_guides/language_basics.md b/docs/programming_guides/language_basics.md new file mode 100644 index 0000000000000000000000000000000000000000..1152680c970460f3c91f871d2b0e82ec73034918 --- /dev/null +++ b/docs/programming_guides/language_basics.md @@ -0,0 +1,234 @@ +# Language Basics + +This page introduces the core TileLang (tile‑lang) DSL that you’ll use to write +high‑performance kernels. It focuses on how to define a kernel, express +iteration, move data across memory scopes, and run it with JIT. + +The examples use the conventional aliases: + +```python +import tilelang +import tilelang.language as T +from tilelang import jit +``` + +## 1. 
Defining a Kernel with `@T.prim_func` + +TileLang kernels are TIR (TVM IR) functions produced by the `@T.prim_func` +decorator. Arguments are annotated with shapes and dtypes via `T.Tensor` or +`T.Buffer`. + +Note on dtypes +- You can pass dtypes as a string (e.g., 'float32'), a TileLang dtype (e.g., `T.float32`), + or a framework dtype (e.g., `torch.float32`). TileLang normalizes all of these. + See Type System for details. + +```python +@T.prim_func +def add_kernel( + A: T.Tensor((N,), dtype), # dtype could be 'float32' | T.float32 | torch.float32 + B: T.Tensor((N,), dtype), + C: T.Tensor((N,), dtype), +): + ... # kernel body +``` + +- Shapes may be concrete integers or symbolic. For symbolic, you can pass + Python ints through the outer `@jit` wrapper (shown below), or annotate with + `T.dyn` when you want a named symbolic dimension. + +```python +# Named symbolic dimension (optional) +K = T.dyn['K'] +@T.prim_func +def uses_dyn(A: T.Tensor((K,), 'float32')): + ... +``` + +### Dynamic symbolic dimensions: two ways + +TileLang supports two complementary ways to introduce symbolic (dynamic) dims: + +- Type-level annotations via `T.dyn[...]` (recommended for function signatures) + - Use in `T.Tensor((T.dyn['K'], ...), dtype)` or bind once then reuse (as above). + - Inside the kernel body, prefer reading from the buffer’s shape, e.g. `M = A.shape[0]`. + +- Term-level variables via `T.dynamic(name, dtype)` + - Creates a TIR `tir.Var` you can use directly in expressions/loops. + - Handy when you need to reference the dimension symbol in the body. + +```python +# 1) Annotation-only symbol; read the bound size via shape +K = T.dyn['K'] # dtype defaults to int32 +@T.prim_func +def foo(A: T.Tensor((K,), 'float32')): + N = A.shape[0] + for i in T.serial(N): + ... + +# 2) Explicit Var symbol usable in the body +K = T.dynamic('K', 'int32') # or T.dynamic('K') defaults to int32 +@T.prim_func +def bar(A: T.Tensor((K,), 'float32')): + for i in T.serial(K): + ... +``` + +Notes +- `T.symbolic(name, dtype)` is a deprecated alias of `T.dynamic`; prefer `T.dynamic`. +- Under `@jit`, concrete sizes come from the actual tensor arguments at the first call. +- Symbols in annotations do not need to be separate kernel arguments; TileLang binds them from argument shapes. + +## 2. Launching Work with `T.Kernel` + +`with T.Kernel(...)` declares a launch context and creates block/thread +bindings. For GPU backends, specify a grid and threads per block. + +```python +with T.Kernel(grid_x, grid_y, threads=128) as (bx, by): + ... # bx/by are blockIdx.x/y +``` + +You rarely need raw thread indices; most kernels use structured loops +(`T.serial`, `T.unroll`, `T.Parallel`, `T.Pipelined`) inside a `T.Kernel`. + +## 3. Loops and Control Flow + +Core loop constructs map to familiar hardware patterns: + +- `T.serial(start, stop[, step])`: plain for‑loop +- `T.unroll(start, stop[, step])`: unrolled loop +- `T.Parallel(ext0, ext1, ...)`: nested parallel loops (elementwise‑friendly) +- `T.Pipelined(iters, num_stages=N)`: software pipelining for producer/consumer + +```python +for i in T.serial(N): + ... + +for i, j in T.Parallel(M, N): + C[i, j] = A[i, j] + B[i, j] + +for k in T.Pipelined(T.ceildiv(K, BK), num_stages=3): + # overlap copy/compute across stages + ... +``` + +Conditionals use standard Python `if`/`else`. Guard edges with predicates when +tile sizes do not divide problem sizes evenly. + +## 4. 
Memory Scopes and Allocation + +TileLang exposes key software‑managed scopes: + +- Global: device memory (default for `T.Tensor` arguments) +- Shared: on‑chip, block‑visible (`T.alloc_shared(shape, dtype)`) +- Fragment and scalars: per‑thread fragments and scalar vars but in Shared View + (`T.alloc_fragment`, `T.alloc_var`) + +```python +A_shared = T.alloc_shared((BM, BK), 'float16') +B_shared = T.alloc_shared((BK, BN), 'float16') +C_local = T.alloc_fragment((BM, BN), 'float32') +T.clear(C_local) # zero accumulators +``` + +## 5. Moving Data: `T.copy` + +Use `T.copy(src, dst)` to move tiles between scopes. It accepts buffers, +buffer regions, or buffer loads; extents are inferred or can be broadcast. + +```python +# Global -> Shared (tile copy), extents inferred from dst +T.copy(A[by * BM, ko * BK], A_shared) +T.copy(B[ko * BK, bx * BN], B_shared) + +# Fragment -> Global (store back) +T.copy(C_local, C[by * BM, bx * BN]) +``` + +`T.copy` performs coalescing and scope‑specific lowering during compilation. + +## 6. A Minimal End‑to‑End Example (Vector Add) + +```python +import tilelang +import tilelang.language as T +from tilelang import jit + +@jit # infers target from tensors at first call +def add(N: int, block: int = 256, dtype: str = 'float32'): + + @T.prim_func + def add_kernel( + A: T.Tensor((N,), dtype), + B: T.Tensor((N,), dtype), + C: T.Tensor((N,), dtype), + ): + with T.Kernel(T.ceildiv(N, block), threads=block) as bx: + for i in T.Parallel(block): + gi = bx * block + i + # Optional — LegalizeSafeMemoryAccess inserts a guard when an access may be OOB + C[gi] = A[gi] + B[gi] + + return add_kernel + +# Host side (PyTorch shown; NumPy/DLPack also supported) +import torch +N = 1 << 20 +A = torch.randn(N, device='cuda', dtype=torch.float32) +B = torch.randn(N, device='cuda', dtype=torch.float32) +C = torch.empty(N, device='cuda', dtype=torch.float32) + +kernel = add(N) +kernel(A, B, C) # runs on GPU +torch.testing.assert_close(C, A + B) +``` + +Notes +- The `@jit` wrapper returns a callable kernel after the first compilation. +- You can pass compile‑time tunables (tile sizes, dtypes) through the outer + Python function and bake them into the generated TIR. + +## 7. Tiled GEMM Skeleton + +Below is a minimal pattern for a tiled GEMM using shared memory staging and a +fragment accumulator. It mirrors the quickstart style found in the repository. + +```python +@T.prim_func +def gemm( + A: T.Tensor((M, K), 'float16'), + B: T.Tensor((K, N), 'float16'), + C: T.Tensor((M, N), 'float16'), +): + with T.Kernel(T.ceildiv(N, BN), T.ceildiv(M, BM), threads=128) as (bx, by): + A_s = T.alloc_shared((BM, BK), 'float16') + B_s = T.alloc_shared((BK, BN), 'float16') + C_f = T.alloc_fragment((BM, BN), 'float32') + T.clear(C_f) + + for ko in T.Pipelined(T.ceildiv(K, BK), num_stages=3): + T.copy(A[by * BM, ko * BK], A_s) + T.copy(B[ko * BK, bx * BN], B_s) + T.gemm(A_s, B_s, C_f) # lowered to tensor‑core/ISA specific kernels + + T.copy(C_f, C[by * BM, bx * BN]) +``` + +## 8. Debugging and Printing + +Use `T.print` inside a kernel for quick introspection. TileLang emits printing +from a single thread for shared/fragment scopes to avoid floods. + +```python +T.print(C_f, msg='accumulator:') +T.print(A_s, msg='A tile:') +T.print(C[0], msg='C[0] = ') +``` + +## 9. 
Where to Go Next + +- Control flow details: see Programming Guides → Control Flow +- Memory topics: see Programming Guides → (removed cache/layout); basics are covered inline +- Autotuning tile sizes and mappings: Programming Guides → Autotuning +- Operator examples (GEMM, GEMV, attention): see Deep Learning Operators diff --git a/docs/programming_guides/overview.md b/docs/programming_guides/overview.md new file mode 100644 index 0000000000000000000000000000000000000000..64b6d20390bd7350001a9882abda10906cea2874 --- /dev/null +++ b/docs/programming_guides/overview.md @@ -0,0 +1,27 @@ +# Programming Guides Overview + +This section provides a practical guide to writing high‑performance kernels with Tile Language (tile‑lang). +It mirrors the structure of a similar guide in another project and adapts it to tile‑lang concepts and APIs. + +- Audience: Developers implementing custom GPU/CPU kernels with tile‑lang +- Prereqs: Basic Python, NumPy/Tensor concepts, and familiarity with GPU programming notions +- Scope: Language basics, control flow, instructions, autotuning, and type system + +## What You’ll Learn +- How to structure kernels with TileLang’s core DSL constructs +- How to move data across global/shared/fragment and pipeline compute +- How to apply autotuning to tile sizes and schedules +- How to specify and work with dtypes in kernels + +## Suggested Reading Order +1. Language Basics +2. Control Flow +3. Instructions +4. Autotuning +5. Type System + +## Related Docs +- Tutorials: see existing guides in `tutorials/` +- Operators: examples in `deeplearning_operators/` + +> NOTE: This is a draft scaffold. Fill in code snippets and benchmarks as APIs evolve. diff --git a/docs/programming_guides/type_system.md b/docs/programming_guides/type_system.md new file mode 100644 index 0000000000000000000000000000000000000000..32b9274d7c48bcd3b7ebfcdc4b35a56862b86f10 --- /dev/null +++ b/docs/programming_guides/type_system.md @@ -0,0 +1,42 @@ +# Type System + +This page lists the data types supported by TileLang and how to specify them in +kernels. For full details and the authoritative list, see the API Reference +(`autoapi/tilelang/index`) and `tilelang.language.v2.dtypes`. + +How to specify dtypes +- Use any of the following forms; TileLang normalizes them internally: + - String: `'float32'`, `'int8'`, `'bfloat16'`, ... + - TileLang dtype object: `T.float32`, `T.int8`, `T.bfloat16`, ... + - Framework dtype: `torch.float32`, `torch.int8`, `torch.bfloat16`, ... + +Common scalar types +- Boolean: `bool` +- Signed integers: `int8`, `int16`, `int32`, `int64` +- Unsigned integers: `uint8`, `uint16`, `uint32`, `uint64` +- Floating‑point: `float16` (half), `bfloat16`, `float32`, `float64` + +Float8 and low‑precision families +- Float8: `float8_e3m4`, `float8_e4m3`, `float8_e4m3b11fnuz`, `float8_e4m3fn`, + `float8_e4m3fnuz`, `float8_e5m2`, `float8_e5m2fnuz`, `float8_e8m0fnu` +- Float6: `float6_e2m3fn`, `float6_e3m2fn` +- Float4: `float4_e2m1fn` + +Vectorized element types (SIMD packs) +- For many base types, vector‑packed variants are available by lane count: + `x2`, `x4`, `x8`, `x16`, `x32`, `x64`. +- Examples: + - Integers: `int8x2`, `int8x4`, ..., `int32x2`, `int32x4`, ... + - Unsigned: `uint8x2`, `uint8x4`, ... + - Floats: `float16x2`, `float16x4`, `float32x2`, `float32x4`, ... + - Float8/6/4 families also provide `x2/x4/x8/x16/x32/x64` where applicable, + e.g., `float8_e4m3x2`, `float8_e4m3x4`, `float6_e2m3fnx8`, `float4_e2m1fnx16`. 
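+
+As a quick illustration of the interchangeable spellings listed above, here is a
+minimal sketch (mirroring the vector-add example from Language Basics) in which
+all three dtype forms describe the same `float16` element type:
+
+```python
+import torch
+import tilelang.language as T
+
+N = 1024
+BLOCK = 256
+
+@T.prim_func
+def add_mixed_spelling(
+    A: T.Tensor((N,), 'float16'),      # string form
+    B: T.Tensor((N,), T.float16),      # TileLang dtype object
+    C: T.Tensor((N,), torch.float16),  # framework (torch) dtype
+):
+    with T.Kernel(T.ceildiv(N, BLOCK), threads=BLOCK) as bx:
+        for i in T.Parallel(BLOCK):
+            C[bx * BLOCK + i] = A[bx * BLOCK + i] + B[bx * BLOCK + i]
+```
+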
+ +Notes +- Availability of certain low‑precision formats (float8/6/4) depends on target + architecture and backend support. +- Choose accumulation dtypes explicitly for mixed‑precision compute (e.g., + GEMM with `float16` inputs and `float32` accumulators). +- The complete, up‑to‑date list is exposed in + `tilelang.language.v2.dtypes` and rendered in the API Reference. + diff --git a/docs/tutorials/debug_tools_for_tilelang.md b/docs/tutorials/debug_tools_for_tilelang.md index e18b13279548621afd34474a06b682e6ca0752a5..f8dfaab826e0c2896a922a915cce44b2f9ab8153 100644 --- a/docs/tutorials/debug_tools_for_tilelang.md +++ b/docs/tutorials/debug_tools_for_tilelang.md @@ -171,6 +171,32 @@ The output messages will include something like: msg='hello world' BlockIdx=(0, 0, 0), ThreadIdx=(0, 0, 0): 0 ``` +### Visual Layout Inference For TileLang + The **Visual Layout Inference** tool automatically generates visual diagrams that illustrate the mapping between logical indices, thread IDs, and register file locations. + +When TileLang performs layout inference, it determines how fragment buffers are distributed across threads. The visual layout tool captures this information and generates: +1. **Textual output**: A human-readable description of the layout mapping +2. **Visual diagrams**: Color-coded plots showing the thread-to-data mapping + +The visual layout inference tool is controlled through the `TL_LAYOUT_VISUALIZATION_ENABLE` and `TL_LAYOUT_VISUALIZATION_FORMATS` pass configuration. By default, `TL_LAYOUT_VISUALIZATION_ENABLE` is **disabled** to avoid performance overhead during compilation. + +When enabled, `TL_LAYOUT_VISUALIZATION_FORMATS` accepts string values to control output formats: +- "txt": Text output only (same as default) +- "all": Generates all formats (TXT, PDF, PNG, SVG) +- "png": Generate PNG format only +- "pdf": Generate PDF format only +- "svg": Generate SVG format only +- "txt,svg": Generate multiple formats (comma-separated) in addition to text output + +The output messages of "txt" will include something like: +``` +C_local inferenced layout: + Shape: [32, 32] -> [8] + Thread: _j // 16 * 64 + _i // 16 * 32 + _i % 8 * 4 + _j % 8 // 2 + Index: [_j % 16 // 8 * 4 + _i % 16 // 8 * 2 + _j % 2] +``` + + ## Conclusion By carefully examining intermediate representations (IR) before final code generation—and by leveraging runtime printing through `T.print`—one can quickly diagnose where index calculations, copy logic, or other kernel operations deviate from the intended behavior. This two-pronged approach (inspecting IR transformations and using runtime prints) is often sufficient for resolving generation and correctness issues in TileLang programs. diff --git a/docs/tutorials/logging.md b/docs/tutorials/logging.md new file mode 100644 index 0000000000000000000000000000000000000000..5caf432801cd7d0dfa1b5dfb2eef5c9e824e937c --- /dev/null +++ b/docs/tutorials/logging.md @@ -0,0 +1,118 @@ +Logging in Tilelang/TVM +=================================================== +
+Author: SiriusNEO +
+ +## TVM Logging Overview + +Tilelang currently utilizes the logging system from TVM. The implementation can be found in: + +- [include/tvm/runtime/logging.h](https://github.com/apache/tvm/blob/main/include/tvm/runtime/logging.h): Macro definitions +- [src/runtime/logging.cc](https://github.com/apache/tvm/blob/main/src/runtime/logging.cc): Logging logic implementation + +The design style is inspired by [Google's glog](https://google.github.io/glog/stable/). + +## Logging Categories + +There are three primary macro types: + +```c++ +LOG(INFO) << "aaa"; +DLOG(INFO) << "aaa"; +VLOG(1) << "aaa"; +``` + +- **LOG**: Standard logging preserved in code for displaying necessary information at different levels during runtime. Most Tilelang C++ error reporting is implemented via `LOG(FATAL) << "error msg"`. +- **DLOG**: Debug logging for developer debugging output. DLOG is controlled at build time by the TVM_LOG_DEBUG environment variable and is **eliminated in Release builds through dead code elimination**. + - The key difference between LOG(DEBUG) and DLOG is this build-time elimination. We recommend using DLOG over LOG(DEBUG), as the latter has overlapping functionality and gets compiled into the release runtime. +- **VLOG**: [Verbose logging](https://google.github.io/glog/stable/logging/#verbose-logging), primarily for debugging. Its main feature is customizable verbosity levels. For example, VLOG(n) where n can be 1, 2, 3, 4, 5, or 6, enabling complex tracing requirements. In contrast, LOG and DLOG typically use predefined verbose levels like INFO and DEBUG. + - In practical Tilelang development, VLOG is used less frequently. + - TVM's VLOG is implemented using DLOG, thus inheriting DLOG's characteristics. + +Additional useful macros include various **CHECK** variants: + +```c++ +CHECK(cond) << "error msg"; +DCHECK(cond) << "error msg"; +ICHECK(cond) << "error msg"; +``` + +The implementation routes errors to LogFatal: + +```c++ +#define CHECK(x) \ + if (!(x)) \ + ::tvm::runtime::detail::LogFatal(__FILE__, __LINE__).stream() \ + << "Check failed: (" #x << ") is false: " +``` +- **DCHECK**: Debug mode CHECK, only compiled in debug builds +- **ICHECK**: Internal Check that should exist in Release builds. When ICHECK fails, the entire system should report an error. + +## Logging Verbose Levels + +TVM defines 5 levels for LOG and DLOG (adding DEBUG compared to glog): + +```c++ +#define TVM_LOG_LEVEL_DEBUG 0 +#define TVM_LOG_LEVEL_INFO 1 +#define TVM_LOG_LEVEL_WARNING 2 +#define TVM_LOG_LEVEL_ERROR 3 +#define TVM_LOG_LEVEL_FATAL 4 +``` + +## Using Logging in TileLang Development + +### Guidelines + +For temporary debugging output in your code, there are no restrictions (you can even use std::cout). Just remember to remove it before submitting a PR. + +For meaningful logging that should remain in the Tilelang codebase: + +- Critical correctness checks: Use ICHECK with sufficient error messages to facilitate debugging when issues arise. +- Complex Pass debugging: For passes requiring intermediate output that may need future review (e.g., LayoutInference), use DLOG. +- General INFO/WARNING messages: Use standard LOG. + +### Enabling Log Output in Tilelang + +To specify current log level at runtime, we need to set the environment variable `TVM_LOG_LEVEL`. An example usage is: + +```c++ +TVM_LOG_DEBUG=1 python3 code.py +``` + +which enables all DEBUG/INFO (level <= 1) logs for all files. + +#### Detailed Rules for TVM_LOG_DEBUG Specification + +The parsing logic is in `logging.cc`. 
Reference: [HyperAI Zhihu Article](https://zhuanlan.zhihu.com/p/1933106843468665163). + +Launch Python with `TVM_LOG_DEBUG=`, where `` is a comma-separated list of level assignments in the form `=`. Important notes: + +- The special filename DEFAULT sets the LOG level for all files. +- `` can be set to -1 to disable LOG for that file. +- `` is the C++ source filename (e.g., .cc, not .h) relative to the `src/` directory in the TVM repository. The `src/` prefix is optional when specifying file paths. + +### Enabling Debug Mode + +To enable DLOG/DCHECK, developers need to first build Tilelang in Debug mode: + +```bash +cmake .. -DCMAKE_BUILD_TYPE=Debug -DUSE_CUDA=ON +``` + +Tilelang's CMake logic automatically adds the `TVM_LOG_DEBUG` macro, compiling all DLOG statements: + +```cmake +target_compile_definitions(tilelang_objs PRIVATE "TVM_LOG_DEBUG") +``` + +Then you also need to specify the runtime environment variables. For example, to use `DLOG(INFO) << "xxx"` for debugging, run your code with INFO level (1): `TVM_LOG_DEBUG=1`. + +:::{note} + **Important**: There are two TVM_LOG_DEBUG variables. (1) Compile-time macro: Determines whether debug content (like DLOG) is compiled into the .so file. Referenced in C++ source via #ifdef TVM_LOG_DEBUG. This is automatically enabled when using Debug build mode in CMake. (2) Runtime environment variable: Controls logging level at runtime. TVM provides a specification for this variable, allowing control over per-file logging levels. + + These two should ideally have different names, but TVM uses the same name for both, which can cause confusion. +::: + + diff --git a/examples/amd/example_amd_flash_attn_bwd.py b/examples/amd/example_amd_flash_attn_bwd.py index d47866e1e27df5dafcf6a27ebc156705de4a8bf3..788aec367c45a56bda3bd001c3c8b5e6c6ebfe45 100644 --- a/examples/amd/example_amd_flash_attn_bwd.py +++ b/examples/amd/example_amd_flash_attn_bwd.py @@ -2,7 +2,7 @@ import torch import torch.nn.functional as F import tilelang import tilelang.language as T -from tilelang.primitives.gemm.base import GemmWarpPolicy +from tilelang.tileop.base import GemmWarpPolicy import itertools import argparse from functools import partial @@ -11,22 +11,20 @@ import time def ref_program(Q, K, V, is_causal, groups=1): - assert Q.size( - 2) == K.size(2) * groups, f"Q heads {Q.size(2)} K heads {K.size(2)} groups {groups}" - assert Q.size( - 2) == V.size(2) * groups, f"Q heads {Q.size(2)} V heads {V.size(2)} groups {groups}" + assert Q.size(2) == K.size(2) * groups, f"Q heads {Q.size(2)} K heads {K.size(2)} groups {groups}" + assert Q.size(2) == V.size(2) * groups, f"Q heads {Q.size(2)} V heads {V.size(2)} groups {groups}" dim = Q.size(-1) K_ref = K.repeat_interleave(groups, dim=2) V_ref = V.repeat_interleave(groups, dim=2) - scores = torch.einsum('bqhd,bkhd->bhqk', Q, K_ref) + scores = torch.einsum("bqhd,bkhd->bhqk", Q, K_ref) scores = scores / torch.sqrt(torch.tensor(dim, dtype=scores.dtype)) if is_causal: seq_len = Q.size(1) mask = torch.tril(torch.ones(seq_len, seq_len, device=scores.device)) mask = mask.unsqueeze(0).unsqueeze(0) - scores = scores.masked_fill(mask == 0, float('-inf')) + scores = scores.masked_fill(mask == 0, float("-inf")) attention_weights = F.softmax(scores, dim=-1) - output = torch.einsum('bhqk,bkhd->bqhd', attention_weights, V_ref) + output = torch.einsum("bhqk,bkhd->bqhd", attention_weights, V_ref) lse = torch.logsumexp(scores, dim=-1).float() return output, lse @@ -45,23 +43,23 @@ def get_fwd_configs(): valid_configs = [] - for m, n, s, t, stages, r, k, 
p, qkw, vw in itertools.product(block_M, block_N, num_split_q, - threads, num_stages, - enable_rasterization, k_pack, - panel_size, qk_coalesced_width, - v_coalesced_width): - valid_configs.append({ - "block_M": m, - "block_N": n, - "num_split_q": s, - "threads": t, - "num_stages": stages, - "enable_rasterization": r, - "k_pack": k, - "panel_size": p, - "qk_coalesced_width": qkw, - "v_coalesced_width": vw, - }) + for m, n, s, t, stages, r, k, p, qkw, vw in itertools.product( + block_M, block_N, num_split_q, threads, num_stages, enable_rasterization, k_pack, panel_size, qk_coalesced_width, v_coalesced_width + ): + valid_configs.append( + { + "block_M": m, + "block_N": n, + "num_split_q": s, + "threads": t, + "num_stages": stages, + "enable_rasterization": r, + "k_pack": k, + "panel_size": p, + "qk_coalesced_width": qkw, + "v_coalesced_width": vw, + } + ) return valid_configs @@ -85,23 +83,23 @@ def fast_flashattn( qk_coalesced_width: int, v_coalesced_width: int, ): - scale = (1.0 / dim)**0.5 + scale = (1.0 / dim) ** 0.5 head_kv = heads // groups q_shape = [batch, seq_len, heads, dim] kv_shape = [batch, seq_len, head_kv, dim] - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 vec_size = qk_coalesced_width v_vec_size = v_coalesced_width @T.prim_func def main( - Q: T.Tensor(q_shape, dtype), - K: T.Tensor(kv_shape, dtype), - V: T.Tensor(kv_shape, dtype), - Output: T.Tensor(q_shape, dtype), - LSE: T.Tensor([batch, heads, seq_len], accum_dtype), + Q: T.Tensor(q_shape, dtype), + K: T.Tensor(kv_shape, dtype), + V: T.Tensor(kv_shape, dtype), + Output: T.Tensor(q_shape, dtype), + LSE: T.Tensor([batch, heads, seq_len], accum_dtype), ): with T.Kernel(num_split_q, batch * heads, threads=threads) as (b_split, byz_combined): T.use_swizzle(panel_size, enable=enable_rasterization) @@ -111,7 +109,7 @@ def fast_flashattn( num_q_blocks = T.ceildiv(seq_len, block_M) - bx_loop_var = T.alloc_var("int32") + bx_loop_var = T.alloc_var(T.int32) bx_loop_var = b_split with T.While(bx_loop_var < num_q_blocks): @@ -135,33 +133,21 @@ def fast_flashattn( m_prev = T.alloc_fragment([block_M], accum_dtype) scale_factor = T.alloc_fragment([block_M], accum_dtype) - T.copy( - Q[bz, q_block_offset:q_block_offset + block_M, by, :], - Q_shared, - coalesced_width=vec_size) + T.copy(Q[bz, q_block_offset : q_block_offset + block_M, by, :], Q_shared, coalesced_width=vec_size) - loop_end_k = ( - T.ceildiv(q_block_offset + - block_M, block_N) if is_causal else T.ceildiv(seq_len, block_N)) + loop_end_k = T.ceildiv(q_block_offset + block_M, block_N) if is_causal else T.ceildiv(seq_len, block_N) row_sum = T.alloc_fragment([block_M], accum_dtype) for k in T.Pipelined(loop_end_k, num_stages=num_stages): kv_idx = k * block_N - T.copy( - K[bz, kv_idx:kv_idx + block_N, by // groups, :], - K_shared, - coalesced_width=vec_size) - T.copy( - V[bz, kv_idx:kv_idx + block_N, by // groups, :], - V_shared, - coalesced_width=v_vec_size) + T.copy(K[bz, kv_idx : kv_idx + block_N, by // groups, :], K_shared, coalesced_width=vec_size) + T.copy(V[bz, kv_idx : kv_idx + block_N, by // groups, :], V_shared, coalesced_width=v_vec_size) if is_causal: for i, j in T.Parallel(block_M, block_N): - acc_s[i, j] = T.if_then_else(q_block_offset + i >= kv_idx + j, 0, - -T.infinity(acc_s.dtype)) + acc_s[i, j] = T.if_then_else(q_block_offset + i >= kv_idx + j, 0, -T.infinity(acc_s.dtype)) else: T.clear(acc_s) T.gemm( @@ -178,6 +164,8 @@ def fast_flashattn( T.copy(m_i, m_prev) T.reduce_max(acc_s, m_i, dim=1, clear=False) + for i in 
T.Parallel(block_M): + m_i[i] = T.max(m_i[i], m_prev[i]) for i in T.Parallel(block_M): if m_prev[i] == -T.infinity(accum_dtype): @@ -214,8 +202,7 @@ def fast_flashattn( for i in T.Parallel(block_M): if q_block_offset + i < seq_len: - lse_val = T.if_then_else(l_i[i] > 0, - T.log(l_i[i]) + m_i[i], -T.infinity(accum_dtype)) + lse_val = T.if_then_else(l_i[i] > 0, T.log(l_i[i]) + m_i[i], -T.infinity(accum_dtype)) LSE[bz, by, q_block_offset + i] = lse_val bx_loop_var = current_bx + num_split_q @@ -232,30 +219,30 @@ def get_bwd_configs(): panel_size = [7, 8, 9, 10] configs = [] - for m, n, stages, t, r, p in itertools.product(block_M, block_N, num_stages, threads, - enable_rasterization, panel_size): - configs.append({ - "block_M": m, - "block_N": n, - "num_stages": stages, - "threads": t, - "enable_rasterization": r, - "panel_size": p, - }) + for m, n, stages, t, r, p in itertools.product(block_M, block_N, num_stages, threads, enable_rasterization, panel_size): + configs.append( + { + "block_M": m, + "block_N": n, + "num_stages": stages, + "threads": t, + "enable_rasterization": r, + "panel_size": p, + } + ) return configs @tilelang.jit(out_idx=[2]) def flashattn_bwd_preprocess(batch, heads, seq_len, dim): - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 shape = [batch, seq_len, heads, dim] blk = 32 @T.prim_func - def flash_bwd_prep(O: T.Tensor(shape, dtype), dO: T.Tensor(shape, dtype), - Delta: T.Tensor([batch, heads, seq_len], accum_dtype)): + def flash_bwd_prep(O: T.Tensor(shape, dtype), dO: T.Tensor(shape, dtype), Delta: T.Tensor([batch, heads, seq_len], accum_dtype)): with T.Kernel(batch, heads, T.ceildiv(seq_len, blk)) as (bz, bx, by): o = T.alloc_fragment([blk, blk], dtype) do = T.alloc_fragment([blk, blk], dtype) @@ -263,36 +250,51 @@ def flashattn_bwd_preprocess(batch, heads, seq_len, dim): delta = T.alloc_fragment([blk], accum_dtype) T.clear(acc) for k in range(T.ceildiv(dim, blk)): - T.copy(O[bz, by * blk:(by + 1) * blk, bx, k * blk:(k + 1) * blk], o) - T.copy(dO[bz, by * blk:(by + 1) * blk, bx, k * blk:(k + 1) * blk], do) + T.copy(O[bz, by * blk : (by + 1) * blk, bx, k * blk : (k + 1) * blk], o) + T.copy(dO[bz, by * blk : (by + 1) * blk, bx, k * blk : (k + 1) * blk], do) for i, j in T.Parallel(blk, blk): acc[i, j] += o[i, j] * do[i, j] T.reduce_sum(acc, delta, 1) - T.copy(delta, Delta[bz, bx, by * blk:(by + 1) * blk]) + T.copy(delta, Delta[bz, bx, by * blk : (by + 1) * blk]) return flash_bwd_prep @tilelang.autotune(configs=get_bwd_configs(), cache_input_tensors=True) @tilelang.jit -def flashattn_bwd(batch, heads, seq_len, dim, is_causal, groups, block_M: int, block_N: int, - num_stages: int, threads: int, enable_rasterization: bool, panel_size: int): - sm_scale = (1.0 / dim)**0.5 +def flashattn_bwd( + batch, + heads, + seq_len, + dim, + is_causal, + groups, + block_M: int, + block_N: int, + num_stages: int, + threads: int, + enable_rasterization: bool, + panel_size: int, +): + sm_scale = (1.0 / dim) ** 0.5 head_kv = heads // groups q_shape = [batch, seq_len, heads, dim] kv_shape = [batch, seq_len, head_kv, dim] - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 @T.prim_func - def flash_bwd_kernel(Q: T.Tensor(q_shape, - dtype), K: T.Tensor(kv_shape, - dtype), V: T.Tensor(kv_shape, dtype), - dO: T.Tensor(q_shape, dtype), lse: T.Tensor([batch, heads, seq_len], - accum_dtype), - Delta: T.Tensor([batch, heads, seq_len], - accum_dtype), dQ: T.Tensor(q_shape, accum_dtype), - dK: T.Tensor(kv_shape, accum_dtype), 
dV: T.Tensor(kv_shape, accum_dtype)): + def flash_bwd_kernel( + Q: T.Tensor(q_shape, dtype), + K: T.Tensor(kv_shape, dtype), + V: T.Tensor(kv_shape, dtype), + dO: T.Tensor(q_shape, dtype), + lse: T.Tensor([batch, heads, seq_len], accum_dtype), + Delta: T.Tensor([batch, heads, seq_len], accum_dtype), + dQ: T.Tensor(q_shape, accum_dtype), + dK: T.Tensor(kv_shape, accum_dtype), + dV: T.Tensor(kv_shape, accum_dtype), + ): with T.Kernel(heads, T.ceildiv(seq_len, block_M), batch, threads=threads) as (bx, by, bz): T.use_swizzle(panel_size, enable=enable_rasterization) @@ -313,8 +315,8 @@ def flashattn_bwd(batch, heads, seq_len, dim, is_causal, groups, block_M: int, b dk = T.alloc_fragment([block_M, dim], accum_dtype) dq = T.alloc_fragment([block_N, dim], accum_dtype) - T.copy(K[bz, by * block_M:(by + 1) * block_M, bx // groups, :], K_shared) - T.copy(V[bz, by * block_M:(by + 1) * block_M, bx // groups, :], V_shared) + T.copy(K[bz, by * block_M : (by + 1) * block_M, bx // groups, :], K_shared) + T.copy(V[bz, by * block_M : (by + 1) * block_M, bx // groups, :], V_shared) T.clear(dv) T.clear(dk) @@ -322,22 +324,21 @@ def flashattn_bwd(batch, heads, seq_len, dim, is_causal, groups, block_M: int, b loop_ed = T.ceildiv(seq_len, block_N) for k in T.Pipelined(loop_st, loop_ed, num_stages=num_stages): - T.copy(Q[bz, k * block_N:(k + 1) * block_N, bx, :], q_shared) + T.copy(Q[bz, k * block_N : (k + 1) * block_N, bx, :], q_shared) T.clear(qkT) T.gemm(K_shared, q_shared, qkT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - T.copy(lse[bz, bx, k * block_N:(k + 1) * block_N], lse_shared) + T.copy(lse[bz, bx, k * block_N : (k + 1) * block_N], lse_shared) for i, j in T.Parallel(block_M, block_N): P_acc[i, j] = T.exp(qkT[i, j] * sm_scale - lse_shared[j]) if is_causal: for i, j in T.Parallel(block_M, block_N): - P_acc[i, j] = T.if_then_else(by * block_M + i <= k * block_N + j, - P_acc[i, j], 0.0) + P_acc[i, j] = T.if_then_else(by * block_M + i <= k * block_N + j, P_acc[i, j], 0.0) - T.copy(dO[bz, k * block_N:(k + 1) * block_N, bx, :], do_shared) + T.copy(dO[bz, k * block_N : (k + 1) * block_N, bx, :], do_shared) T.clear(dP) T.gemm(V_shared, do_shared, dP, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) @@ -345,7 +346,7 @@ def flashattn_bwd(batch, heads, seq_len, dim, is_causal, groups, block_M: int, b T.copy(P_acc, p_cast) T.gemm(p_cast, do_shared, dv, policy=T.GemmWarpPolicy.FullRow) - T.copy(Delta[bz, bx, k * block_N:(k + 1) * block_N], delta_shared) + T.copy(Delta[bz, bx, k * block_N : (k + 1) * block_N], delta_shared) for i, j in T.Parallel(block_M, block_N): p_cast[i, j] = P_acc[i, j] * (dP[i, j] - delta_shared[j]) * sm_scale @@ -367,8 +368,8 @@ def flashattn_bwd(batch, heads, seq_len, dim, is_causal, groups, block_M: int, b @tilelang.jit(out_idx=[1]) def flashattn_bwd_postprocess(batch, heads, seq_len, dim): - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 shape = [batch, seq_len, heads, dim] blk = 64 @@ -376,8 +377,8 @@ def flashattn_bwd_postprocess(batch, heads, seq_len, dim): def flash_bwd_post(dQ_in: T.Tensor(shape, accum_dtype), dQ_out: T.Tensor(shape, dtype)): with T.Kernel(T.ceildiv(seq_len, blk), heads, batch, threads=128) as (bx, by, bz): T.copy( - dQ_in[bz, bx * blk:(bx + 1) * blk, by, :], - dQ_out[bz, bx * blk:(bx + 1) * blk, by, :], + dQ_in[bz, bx * blk : (bx + 1) * blk, by, :], + dQ_out[bz, bx * blk : (bx + 1) * blk, by, :], ) return flash_bwd_post @@ -444,22 +445,14 @@ def benchmark_function(func, *args, warmup=10, repeat=100): return 
np.median(times) -def main(batch: int = 1, - heads: int = 8, - seq_len: int = 4096, - dim: int = 128, - is_causal: bool = False, - groups: int = 1): - +def main(batch: int = 1, heads: int = 8, seq_len: int = 4096, dim: int = 128, is_causal: bool = False, groups: int = 1): device = "cuda" dtype = torch.float16 torch.manual_seed(42) torch.cuda.manual_seed(42) - print( - f"Test configuration: batch={batch}, heads={heads}, seq_len={seq_len}, dim={dim}, is_causal={is_causal}, groups={groups}" - ) + print(f"Test configuration: batch={batch}, heads={heads}, seq_len={seq_len}, dim={dim}, is_causal={is_causal}, groups={groups}") flops_per_gemm = 2.0 * batch * heads * seq_len * seq_len * dim total_flops = 5 * flops_per_gemm @@ -515,22 +508,19 @@ def main(batch: int = 1, o_ref.backward(dO) print("Verifying backward pass correctness...") - dq_close, dq_max_diff, dq_mean_diff = debug_tensor_comparison( - dQ_tl, q_ref.grad, "dQ", rtol=0.05, atol=0.05) + dq_close, dq_max_diff, dq_mean_diff = debug_tensor_comparison(dQ_tl, q_ref.grad, "dQ", rtol=0.05, atol=0.05) if dq_close: print("dQ is correct.") else: print("dQ mismatch detected.") - dk_close, dk_max_diff, dk_mean_diff = debug_tensor_comparison( - dK_tl.to(torch.float16), k_ref.grad, "dK", rtol=0.05, atol=0.05) + dk_close, dk_max_diff, dk_mean_diff = debug_tensor_comparison(dK_tl.to(torch.float16), k_ref.grad, "dK", rtol=0.05, atol=0.05) if dk_close: print("dK is correct.") else: print("dK mismatch detected.") - dv_close, dv_max_diff, dv_mean_diff = debug_tensor_comparison( - dV_tl.to(torch.float16), v_ref.grad, "dV", rtol=0.05, atol=0.05) + dv_close, dv_max_diff, dv_mean_diff = debug_tensor_comparison(dV_tl.to(torch.float16), v_ref.grad, "dV", rtol=0.05, atol=0.05) if dv_close: print("dV is correct.") else: @@ -551,9 +541,7 @@ def main(batch: int = 1, torch.cuda.synchronize() ref_latency = benchmark_function(run_reference_fwd_bwd, warmup=10, repeat=100) - print( - f"Reference PyTorch Forward+Backward: {ref_latency:.2f} ms | {total_flops / ref_latency * 1e-9:.2f} TFlops" - ) + print(f"Reference PyTorch Forward+Backward: {ref_latency:.2f} ms | {total_flops / ref_latency * 1e-9:.2f} TFlops") def run_complete_fwd_bwd(): o_tl_bench, lse_tl_bench = fwd_kernel(q, k, v) @@ -591,12 +579,12 @@ def main(batch: int = 1, if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=1, help='batch size') - parser.add_argument('--heads', type=int, default=8, help='heads') - parser.add_argument('--seq_len', type=int, default=1024, help='sequence length') - parser.add_argument('--dim', type=int, default=64, help='dim') - parser.add_argument('--is_causal', action='store_true', help='causal') - parser.add_argument('--groups', type=int, default=1, help='groups') + parser.add_argument("--batch", type=int, default=1, help="batch size") + parser.add_argument("--heads", type=int, default=8, help="heads") + parser.add_argument("--seq_len", type=int, default=1024, help="sequence length") + parser.add_argument("--dim", type=int, default=64, help="dim") + parser.add_argument("--is_causal", action="store_true", help="causal") + parser.add_argument("--groups", type=int, default=1, help="groups") args = parser.parse_args() main(args.batch, args.heads, args.seq_len, args.dim, args.is_causal, args.groups) diff --git a/examples/amd/example_amd_flash_attn_fwd.py b/examples/amd/example_amd_flash_attn_fwd.py index 6ec5db1e506dc68a1c527ef39c09fc1ac4e278c2..ca9c361ff1235a3f7f49b2900b5c5ee868d92a2e 100644 --- 
a/examples/amd/example_amd_flash_attn_fwd.py +++ b/examples/amd/example_amd_flash_attn_fwd.py @@ -2,29 +2,42 @@ import torch import torch.nn.functional as F import tilelang import tilelang.language as T -from tilelang.primitives.gemm.base import GemmWarpPolicy +from tilelang.tileop.base import GemmWarpPolicy import itertools import argparse from functools import partial +# Custom supply function to ensure tensors are created on GPU +def supply_tensors_gpu(params): + """Supply function that creates tensors on GPU for ROCm/HIP.""" + tensors = [] + for param in params: + if hasattr(param, "shape") and hasattr(param, "dtype"): + # Force creation on GPU device + shape = [int(s) for s in param.shape] + tensor = torch.randn(shape, dtype=param.dtype, device="cuda") + tensors.append(tensor) + else: + tensors.append(param) + return tensors + + def ref_program(Q, K, V, is_causal, groups=1): - assert Q.size( - 2) == K.size(2) * groups, f"Q heads {Q.size(2)} K heads {K.size(2)} groups {groups}" - assert Q.size( - 2) == V.size(2) * groups, f"Q heads {Q.size(2)} V heads {V.size(2)} groups {groups}" + assert Q.size(2) == K.size(2) * groups, f"Q heads {Q.size(2)} K heads {K.size(2)} groups {groups}" + assert Q.size(2) == V.size(2) * groups, f"Q heads {Q.size(2)} V heads {V.size(2)} groups {groups}" dim = Q.size(-1) K = K.repeat_interleave(groups, dim=2) V = V.repeat_interleave(groups, dim=2) - scores = torch.einsum('bqhd,bkhd->bhqk', Q, K) + scores = torch.einsum("bqhd,bkhd->bhqk", Q, K) scores = scores / torch.sqrt(torch.tensor(dim, dtype=scores.dtype)) if is_causal: seq_len = Q.size(1) mask = torch.tril(torch.ones(seq_len, seq_len, device=scores.device)) mask = mask.unsqueeze(0).unsqueeze(0) - scores = scores.masked_fill(mask == 0, float('-inf')) + scores = scores.masked_fill(mask == 0, float("-inf")) attention_weights = F.softmax(scores, dim=-1) - output = torch.einsum('bhqk,bkhd->bqhd', attention_weights, V) + output = torch.einsum("bhqk,bkhd->bqhd", attention_weights, V) return output @@ -43,27 +56,27 @@ def get_configs(): valid_configs = [] - for m, n, s, t, stages, r, k, p, qkw, vw in itertools.product(block_M, block_N, num_split_q, - threads, num_stages, - enable_rasterization, k_pack, - panel_size, qk_coalesced_width, - v_coalesced_width): - valid_configs.append({ - "block_M": m, - "block_N": n, - "num_split_q": s, - "threads": t, - "num_stages": stages, - "enable_rasterization": r, - "k_pack": k, - "panel_size": p, - "qk_coalesced_width": qkw, - "v_coalesced_width": vw, - }) + for m, n, s, t, stages, r, k, p, qkw, vw in itertools.product( + block_M, block_N, num_split_q, threads, num_stages, enable_rasterization, k_pack, panel_size, qk_coalesced_width, v_coalesced_width + ): + valid_configs.append( + { + "block_M": m, + "block_N": n, + "num_split_q": s, + "threads": t, + "num_stages": stages, + "enable_rasterization": r, + "k_pack": k, + "panel_size": p, + "qk_coalesced_width": qkw, + "v_coalesced_width": vw, + } + ) return valid_configs -@tilelang.autotune(configs=get_configs(), cache_input_tensors=True) +@tilelang.autotune(configs=get_configs(), cache_input_tensors=True, supply_prog=supply_tensors_gpu) @tilelang.jit(out_idx=[3]) def fast_flashattn( batch, @@ -83,22 +96,22 @@ def fast_flashattn( qk_coalesced_width: int, v_coalesced_width: int, ): - scale = (1.0 / dim)**0.5 + scale = (1.0 / dim) ** 0.5 head_kv = heads // groups q_shape = [batch, seq_len, heads, dim] kv_shape = [batch, seq_len, head_kv, dim] - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 
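    # `dtype` (fp16) is used for the Q/K/V tiles and the final output, while
    # `accum_dtype` (fp32) holds the softmax statistics and the output accumulator,
    # so the online softmax below is accumulated at full precision before being
    # written back to the fp16 Output tensor.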
vec_size = qk_coalesced_width v_vec_size = v_coalesced_width @T.prim_func def main( - Q: T.Tensor(q_shape, dtype), - K: T.Tensor(kv_shape, dtype), - V: T.Tensor(kv_shape, dtype), - Output: T.Tensor(q_shape, dtype), + Q: T.Tensor(q_shape, dtype), + K: T.Tensor(kv_shape, dtype), + V: T.Tensor(kv_shape, dtype), + Output: T.Tensor(q_shape, dtype), ): with T.Kernel(num_split_q, batch * heads, threads=threads) as (b_split, byz_combined): T.use_swizzle(panel_size, enable=enable_rasterization) @@ -108,7 +121,7 @@ def fast_flashattn( num_q_blocks = T.ceildiv(seq_len, block_M) - bx = T.alloc_var("int32") + bx = T.alloc_var(T.int32) bx = b_split with T.While(bx < num_q_blocks): @@ -132,32 +145,21 @@ def fast_flashattn( m_prev = T.alloc_fragment([block_M], accum_dtype) scale_factor = T.alloc_fragment([block_M], accum_dtype) - T.copy( - Q[bz, q_block_offset:q_block_offset + block_M, by, :], - Q_shared, - coalesced_width=vec_size) + T.copy(Q[bz, q_block_offset : q_block_offset + block_M, by, :], Q_shared, coalesced_width=vec_size) - loop_end_k = T.ceildiv(q_block_offset + block_M, - block_N) if is_causal else T.ceildiv(seq_len, block_N) + loop_end_k = T.ceildiv(q_block_offset + block_M, block_N) if is_causal else T.ceildiv(seq_len, block_N) row_sum = T.alloc_fragment([block_M], accum_dtype) for k in T.Pipelined(loop_end_k, num_stages=num_stages): kv_idx = k * block_N - T.copy( - K[bz, kv_idx:kv_idx + block_N, by // groups, :], - K_shared, - coalesced_width=vec_size) - T.copy( - V[bz, kv_idx:kv_idx + block_N, by // groups, :], - V_shared, - coalesced_width=v_vec_size) + T.copy(K[bz, kv_idx : kv_idx + block_N, by // groups, :], K_shared, coalesced_width=vec_size) + T.copy(V[bz, kv_idx : kv_idx + block_N, by // groups, :], V_shared, coalesced_width=v_vec_size) if is_causal: for i, j in T.Parallel(block_M, block_N): - acc_s[i, j] = T.if_then_else(q_block_offset + i >= kv_idx + j, 0, - -T.infinity(acc_s.dtype)) + acc_s[i, j] = T.if_then_else(q_block_offset + i >= kv_idx + j, 0, -T.infinity(acc_s.dtype)) else: T.clear(acc_s) T.gemm( @@ -171,6 +173,8 @@ def fast_flashattn( T.copy(m_i, m_prev) T.reduce_max(acc_s, m_i, dim=1, clear=False) + for i in T.Parallel(block_M): + m_i[i] = T.max(m_i[i], m_prev[i]) for i in T.Parallel(block_M): sf = T.exp(m_prev[i] * scale - m_i[i] * scale) @@ -205,13 +209,7 @@ def fast_flashattn( return main -def main(batch: int = 1, - heads: int = 8, - seq_len: int = 4096, - dim: int = 128, - is_causal: bool = False, - groups: int = 1): - +def main(batch: int = 1, heads: int = 8, seq_len: int = 4096, dim: int = 128, is_causal: bool = False, groups: int = 1): flops_per_matmul = 2.0 * batch * heads * seq_len * seq_len * dim total_flops = 2 * flops_per_matmul if is_causal: @@ -233,18 +231,16 @@ def main(batch: int = 1, print(f"Reference (PyTorch): {latency:.2f} ms | {total_flops / latency * 1e-9:.2f} TFlops") latency = profiler.do_bench(warmup=100) - print( - f"Fast Flash Attention V2 (Tile-lang): {latency:.2f} ms | {total_flops / latency * 1e-9:.2f} TFlops" - ) + print(f"Fast Flash Attention V2 (Tile-lang): {latency:.2f} ms | {total_flops / latency * 1e-9:.2f} TFlops") if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=1, help='batch size') - parser.add_argument('--heads', type=int, default=8, help='heads') - parser.add_argument('--seq_len', type=int, default=4096, help='sequence length') - parser.add_argument('--dim', type=int, default=128, help='dim') - parser.add_argument('--is_causal', action='store_true', help='causal') - 
parser.add_argument('--groups', type=int, default=1, help='groups') + parser.add_argument("--batch", type=int, default=1, help="batch size") + parser.add_argument("--heads", type=int, default=8, help="heads") + parser.add_argument("--seq_len", type=int, default=4096, help="sequence length") + parser.add_argument("--dim", type=int, default=128, help="dim") + parser.add_argument("--is_causal", action="store_true", help="causal") + parser.add_argument("--groups", type=int, default=1, help="groups") args = parser.parse_args() main(args.batch, args.heads, args.seq_len, args.dim, args.is_causal, args.groups) diff --git a/examples/analyze/README.md b/examples/analyze/README.md index 8171d8826808d548ca14cfeb7ac7dc5861dbf868..9ec0a687547701d6e3a125620306d54fb4e85a9c 100644 --- a/examples/analyze/README.md +++ b/examples/analyze/README.md @@ -21,9 +21,9 @@ M = N = K = 1024 def kernel(block_M=128, block_N=128, block_K=32, num_stages=3, thread_num=128): @T.prim_func - def main(A: T.Tensor((M, K), "float16"), - B: T.Tensor((N, K), "float16"), - C: T.Tensor((M, N), "float")): + def main(A: T.Tensor((M, K), T.float16), + B: T.Tensor((N, K), T.float16), + C: T.Tensor((M, N), T.float)): # ... (kernel definition) return main @@ -40,9 +40,9 @@ from tilelang.carver.arch import CUDA def kernel(N=64, C=256, H=512, W=512, F=512, K=3, block_M=64, block_N=128): @T.prim_func - def main(data: T.Tensor((N, H, W, C), "float16"), - kernel: T.Tensor((K, K, C, F), "float16"), - out: T.Tensor((N, (H-K+1), (W-K+1), F), "float")): + def main(data: T.Tensor((N, H, W, C), T.float16), + kernel: T.Tensor((K, K, C, F), T.float16), + out: T.Tensor((N, (H-K+1), (W-K+1), F), T.float)): # ... (convolution kernel definition) return main diff --git a/examples/analyze/example_conv_analyze.py b/examples/analyze/example_conv_analyze.py index 540fcf4b742ccd2a9fccd88051135d364d6ab10e..db21e02f62bc0ad281848a8085d6b661d2d4e93c 100644 --- a/examples/analyze/example_conv_analyze.py +++ b/examples/analyze/example_conv_analyze.py @@ -25,38 +25,21 @@ def check_hopper(): return False -def kernel(N, - C, - H, - W, - F, - K, - S, - D, - P, - block_M, - block_N, - block_K, - num_stages, - threads, - dtype="float16", - accum_dtype="float"): +def kernel(N, C, H, W, F, K, S, D, P, block_M, block_N, block_K, num_stages, threads, dtype=T.float16, accum_dtype=T.float32): KH, KW = K, K OH = (H + 2 * P - D * (K - 1) - 1) // S + 1 OW = (W + 2 * P - D * (K - 1) - 1) // S + 1 - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 is_hopper = check_hopper() @T.prim_func def conv( - data: T.Tensor((N, H, W, C), dtype), - kernel: T.Tensor((KH, KW, C, F), dtype), - out: T.Tensor((N, OH, OW, F), dtype), + data: T.Tensor((N, H, W, C), dtype), + kernel: T.Tensor((KH, KW, C, F), dtype), + out: T.Tensor((N, OH, OW, F), dtype), ): - with T.Kernel( - T.ceildiv(F, block_N), T.ceildiv(N * OH * OW, block_M), - threads=threads) as (bx, by): + with T.Kernel(T.ceildiv(F, block_N), T.ceildiv(N * OH * OW, block_M), threads=threads) as (bx, by): data_shared = T.alloc_shared((block_M, block_K), dtype) kernel_shared = T.alloc_shared((block_K, block_N), dtype) out_local = T.alloc_fragment((block_M, block_N), accum_dtype) @@ -65,11 +48,13 @@ def kernel(N, kernel_flat = T.Tensor((KH * KW * C, F), dtype, kernel.data) out_flat = T.Tensor((N * OH * OW, F), dtype, out.data) - T.annotate_layout({ - out_shared: make_swizzled_layout(out_shared), - data_shared: make_swizzled_layout(data_shared), - kernel_shared: make_swizzled_layout(kernel_shared), - }) + 
T.annotate_layout( + { + out_shared: make_swizzled_layout(out_shared), + data_shared: make_swizzled_layout(data_shared), + kernel_shared: make_swizzled_layout(kernel_shared), + } + ) T.clear(out_local) for k_iter in T.Pipelined(T.ceildiv(KH * KW * C, block_K), num_stages=num_stages): @@ -81,10 +66,8 @@ def kernel(N, m = by * block_M + i access_h = m % (OH * OW) // OW * S + k // (KW * C) * D - P access_w = m % OW * S + k // C % KW * D - P - in_bound = ((access_h >= 0) and (access_w >= 0) and (access_h < H) and - (access_w < W)) - data_shared[i, j] = T.if_then_else( - in_bound, data[m // (OH * OW), access_h, access_w, k % C], 0) + in_bound = (access_h >= 0) and (access_w >= 0) and (access_h < H) and (access_w < W) + data_shared[i, j] = T.if_then_else(in_bound, data[m // (OH * OW), access_h, access_w, k % C], 0) T.copy(kernel_flat[k_iter * block_K, bx * block_N], kernel_shared) T.gemm(data_shared, kernel_shared, out_local) diff --git a/examples/analyze/example_gemm_analyze.py b/examples/analyze/example_gemm_analyze.py index bfd934f6aa692dd56ef578b9affa27faf0e809fe..0367af126e04d631d53c9d63eb6f269b807dcf2d 100644 --- a/examples/analyze/example_gemm_analyze.py +++ b/examples/analyze/example_gemm_analyze.py @@ -15,14 +15,14 @@ def kernel( thread_num=None, enable_rasteration=None, ): - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 @T.prim_func def matmul( - A: T.Tensor((M, K), dtype), - B: T.Tensor((N, K), dtype), - C: T.Tensor((M, N), dtype), + A: T.Tensor((M, K), dtype), + B: T.Tensor((N, K), dtype), + C: T.Tensor((M, N), dtype), ): with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=thread_num) as (bx, by): A_shared = T.alloc_shared((block_M, block_K), dtype) diff --git a/examples/attention_sink/benchmark_gqa_sink_fwd.py b/examples/attention_sink/benchmark_gqa_sink_fwd.py index 1b7de6b6f2fdff4d3877abf65158a013d3b2c6f1..211ef1d18cda28f20c6104b17f0330322a437d3f 100644 --- a/examples/attention_sink/benchmark_gqa_sink_fwd.py +++ b/examples/attention_sink/benchmark_gqa_sink_fwd.py @@ -1,6 +1,7 @@ import torch import argparse from tilelang.profiler import do_bench +from tilelang import language as T import triton import triton.language as tl from triton.tools.tensor_descriptor import TensorDescriptor @@ -51,8 +52,7 @@ def triton_kernel( q = Q.load([off_z, off_h, start_m * BLOCK_M, 0]).reshape([BLOCK_M, HEAD_DIM]) if BANDWIDTH: - lo, hi = tl.maximum(0, start_q + start_m * BLOCK_M - - BANDWIDTH), start_q + (start_m + 1) * BLOCK_M + lo, hi = tl.maximum(0, start_q + start_m * BLOCK_M - BANDWIDTH), start_q + (start_m + 1) * BLOCK_M else: lo, hi = 0, start_q + (start_m + 1) * BLOCK_M @@ -120,7 +120,8 @@ def triton_program(Q, K, V, Sinks, window_size: Optional[int] = None) -> torch.T BANDWIDTH=window_size, BLOCK_M=BLOCK_M, BLOCK_N=BLOCK_N, - start_q=seq_kv - seq_q) + start_q=seq_kv - seq_q, + ) return o @@ -135,14 +136,14 @@ def main( dtype: str = "float16", tune: bool = False, ): - torch_dtype = {"float16": torch.float16, "bfloat16": torch.bfloat16}[dtype] + dtype = T.dtype(dtype) + torch_dtype = dtype.as_torch() if window_size is not None: - print('Using sliding window attention.') + print("Using sliding window attention.") assert window_size <= seq_q - flops_per_matmul = 2.0 * batch * heads * min( - window_size, seq_kv // 2) * seq_q * dim # just a rough estimation + flops_per_matmul = 2.0 * batch * heads * min(window_size, seq_kv // 2) * seq_q * dim # just a rough estimation else: - print('Using full attention.') + print("Using full attention.") 
flops_per_matmul = 2.0 * batch * heads * seq_q * seq_kv * dim * 0.5 total_flops = 2 * flops_per_matmul @@ -170,15 +171,14 @@ def main( block_N=block_N, num_stages=num_stages, threads=threads, - dtype=dtype) + dtype=dtype, + ) Q, K, V, sinks = gen_inputs(batch, heads, seq_q, seq_kv, dim, groups, dtype=torch_dtype) if torch.allclose( - triton_program(Q, K, V, sinks, window_size), - ref_program(Q, K, V, sinks, window_size, dtype=torch_dtype), - rtol=1e-2, - atol=1e-2): + triton_program(Q, K, V, sinks, window_size), ref_program(Q, K, V, sinks, window_size, dtype=torch_dtype), rtol=1e-2, atol=1e-2 + ): print("Checks for triton passed.✅") else: print("Checks for triton failed.❌") @@ -198,20 +198,14 @@ def main( if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=1, help='batch size') - parser.add_argument('--heads', type=int, default=64, help='heads') - parser.add_argument('--seq_q', type=int, default=2048, help='sequence length of query') - parser.add_argument('--seq_kv', type=int, default=2048, help='sequence length of key/value') - parser.add_argument('--dim', type=int, default=128, help='dim') - parser.add_argument('--groups', type=int, default=8, help='groups') - parser.add_argument( - '--window_size', - type=int, - default=None, - help='window size (default: None, which means full attention)') - parser.add_argument( - '--dtype', type=str, default="float16", help="dtype, can be float16 or bfloat16") - parser.add_argument('--tune', action='store_true', help='tune configs') + parser.add_argument("--batch", type=int, default=1, help="batch size") + parser.add_argument("--heads", type=int, default=64, help="heads") + parser.add_argument("--seq_q", type=int, default=2048, help="sequence length of query") + parser.add_argument("--seq_kv", type=int, default=2048, help="sequence length of key/value") + parser.add_argument("--dim", type=int, default=128, help="dim") + parser.add_argument("--groups", type=int, default=8, help="groups") + parser.add_argument("--window_size", type=int, default=None, help="window size (default: None, which means full attention)") + parser.add_argument("--dtype", type=str, default="float16", help="dtype, can be float16 or bfloat16") + parser.add_argument("--tune", action="store_true", help="tune configs") args = parser.parse_args() - main(args.batch, args.heads, args.seq_q, args.seq_kv, args.dim, args.groups, args.window_size, - args.dtype, args.tune) + main(args.batch, args.heads, args.seq_q, args.seq_kv, args.dim, args.groups, args.window_size, args.dtype, args.tune) diff --git a/examples/attention_sink/benchmark_mha_sink_fwd.py b/examples/attention_sink/benchmark_mha_sink_fwd.py index f50b945356b2e35ef3d7c889a1e8bd2555907bb1..50747e6b09d902668ec99ee5c267c9dccadf208f 100644 --- a/examples/attention_sink/benchmark_mha_sink_fwd.py +++ b/examples/attention_sink/benchmark_mha_sink_fwd.py @@ -1,6 +1,7 @@ import torch import argparse from tilelang.profiler import do_bench +from tilelang import language as T import triton import triton.language as tl from triton.tools.tensor_descriptor import TensorDescriptor @@ -50,8 +51,7 @@ def triton_kernel( q = Q.load([off_z, off_h, start_m * BLOCK_M, 0]).reshape([BLOCK_M, HEAD_DIM]) if BANDWIDTH: - lo, hi = tl.maximum(0, start_q + start_m * BLOCK_M - - BANDWIDTH), start_q + (start_m + 1) * BLOCK_M + lo, hi = tl.maximum(0, start_q + start_m * BLOCK_M - BANDWIDTH), start_q + (start_m + 1) * BLOCK_M else: lo, hi = 0, start_q + (start_m + 1) * BLOCK_M @@ -117,26 +117,29 @@ def 
triton_program(Q, K, V, Sinks, window_size: Optional[int] = None) -> torch.T BANDWIDTH=window_size, BLOCK_M=BLOCK_M, BLOCK_N=BLOCK_N, - start_q=seq_kv - seq_q) + start_q=seq_kv - seq_q, + ) return o -def main(batch: int = 1, - heads: int = 32, - seq_q: int = 256, - seq_kv: int = 256, - dim: int = 128, - window_size: Optional[int] = None, - dtype: str = "float16", - tune: bool = False): - torch_dtype = {"float16": torch.float16, "bfloat16": torch.bfloat16}[dtype] +def main( + batch: int = 1, + heads: int = 32, + seq_q: int = 256, + seq_kv: int = 256, + dim: int = 128, + window_size: Optional[int] = None, + dtype: str = "float16", + tune: bool = False, +): + dtype = T.dtype(dtype) + torch_dtype = dtype.as_torch() if window_size is not None: - print('Using sliding window attention.') + print("Using sliding window attention.") assert window_size <= seq_q - flops_per_matmul = 2.0 * batch * heads * min( - window_size, seq_kv // 2) * seq_q * dim # just a rough estimation + flops_per_matmul = 2.0 * batch * heads * min(window_size, seq_kv // 2) * seq_q * dim # just a rough estimation else: - print('Using full attention.') + print("Using full attention.") flops_per_matmul = 2.0 * batch * heads * seq_q * seq_kv * dim * 0.5 total_flops = 2 * flops_per_matmul @@ -163,15 +166,14 @@ def main(batch: int = 1, block_N=block_N, num_stages=num_stages, threads=threads, - dtype=dtype) + dtype=dtype, + ) Q, K, V, sinks = gen_inputs(batch, heads, seq_q, seq_kv, dim, dtype=torch_dtype) torch.testing.assert_close( - kernel(Q, K, V, sinks), - ref_program(Q, K, V, sinks, window_size, dtype=torch_dtype), - rtol=1e-2, - atol=1e-2) + kernel(Q, K, V, sinks), ref_program(Q, K, V, sinks, window_size, dtype=torch_dtype), rtol=1e-2, atol=1e-2 + ) print("All checks passed.✅") latency = do_bench(lambda: triton_program(Q, K, V, sinks, window_size), warmup=500) @@ -184,19 +186,13 @@ def main(batch: int = 1, if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=8, help='batch size') - parser.add_argument('--heads', type=int, default=32, help='heads') - parser.add_argument('--seq_q', type=int, default=4096, help='sequence length of query') - parser.add_argument('--seq_kv', type=int, default=4096, help='sequence length of key/value') - parser.add_argument('--dim', type=int, default=128, help='dim') - parser.add_argument( - '--window_size', - type=int, - default=None, - help='window size (default: None, which means full attention)') - parser.add_argument( - '--dtype', type=str, default="float16", help="dtype, can be float16 or bfloat16") - parser.add_argument('--tune', action='store_true', help='tune') + parser.add_argument("--batch", type=int, default=8, help="batch size") + parser.add_argument("--heads", type=int, default=32, help="heads") + parser.add_argument("--seq_q", type=int, default=4096, help="sequence length of query") + parser.add_argument("--seq_kv", type=int, default=4096, help="sequence length of key/value") + parser.add_argument("--dim", type=int, default=128, help="dim") + parser.add_argument("--window_size", type=int, default=None, help="window size (default: None, which means full attention)") + parser.add_argument("--dtype", type=str, default="float16", help="dtype, can be float16 or bfloat16") + parser.add_argument("--tune", action="store_true", help="tune") args = parser.parse_args() - main(args.batch, args.heads, args.seq_q, args.seq_kv, args.dim, args.window_size, args.dtype, - args.tune) + main(args.batch, args.heads, args.seq_q, args.seq_kv, 
args.dim, args.window_size, args.dtype, args.tune) diff --git a/examples/attention_sink/example_gqa_sink_bwd_bhsd.py b/examples/attention_sink/example_gqa_sink_bwd_bhsd.py index eec43db996f17b9f235e324ca4663ed1e5a333c7..541baca0430a4378220ce8b48868e64d4014e5dc 100644 --- a/examples/attention_sink/example_gqa_sink_bwd_bhsd.py +++ b/examples/attention_sink/example_gqa_sink_bwd_bhsd.py @@ -20,43 +20,45 @@ def get_bwd_configs(): @tilelang.jit( - out_idx=[3, 4], pass_configs={ + out_idx=[3, 4], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def flashattn_fwd( - batch, - heads, - seq_len, - dim, - groups=1, - window_size=None, # None for full attention - sm_scale=None, - block_M=64, - block_N=64, - num_stages=1, - threads=128, - dtype: str = "float16"): - + batch, + heads, + seq_len, + dim, + groups=1, + window_size=None, # None for full attention + sm_scale=None, + block_M=64, + block_N=64, + num_stages=1, + threads=128, + dtype: T.dtype = T.float16, +): if window_size is not None: assert window_size % block_N == 0, "window_size must be divisible by block_N" if sm_scale is None: - sm_scale = (1.0 / dim)**0.5 + sm_scale = (1.0 / dim) ** 0.5 scale = sm_scale * 1.44269504 # log2(e) head_kv = heads // groups q_shape = [batch, heads, seq_len, dim] kv_shape = [batch, head_kv, seq_len, dim] - accum_dtype = "float" + accum_dtype = T.float32 @T.prim_func def flash_fwd( - Q: T.Tensor(q_shape, dtype), # type: ignore - K: T.Tensor(kv_shape, dtype), # type: ignore - V: T.Tensor(kv_shape, dtype), # type: ignore - Output: T.Tensor(q_shape, dtype), # type: ignore - lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore - Sinks: T.Tensor([heads], dtype), # type: ignore + Q: T.Tensor(q_shape, dtype), # type: ignore + K: T.Tensor(kv_shape, dtype), # type: ignore + V: T.Tensor(kv_shape, dtype), # type: ignore + Output: T.Tensor(q_shape, dtype), # type: ignore + lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + Sinks: T.Tensor([heads], dtype), # type: ignore ): with T.Kernel(T.ceildiv(seq_len, block_M), heads, batch, threads=threads) as (bx, by, bz): Q_shared = T.alloc_shared([block_M, dim], dtype) @@ -73,7 +75,7 @@ def flashattn_fwd( sinks = T.alloc_fragment([heads], dtype) T.annotate_layout({Q_shared: tilelang.layout.make_swizzled_layout(Q_shared)}) - T.copy(Q[bz, by, bx * block_M:(bx + 1) * block_M, :], Q_shared) + T.copy(Q[bz, by, bx * block_M : (bx + 1) * block_M, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) @@ -81,31 +83,30 @@ def flashattn_fwd( sinks[i] = Sinks[by] end = T.min(T.ceildiv(seq_len, block_N), T.ceildiv((bx + 1) * block_M, block_N)) - start = T.max(0, - (bx * block_M - window_size) // block_N) if window_size is not None else 0 + start = T.max(0, (bx * block_M - window_size) // block_N) if window_size is not None else 0 for k in T.Pipelined(start, end, num_stages=num_stages): - T.copy(K[bz, by // groups, k * block_N:(k + 1) * block_N, :], K_shared) + T.copy(K[bz, by // groups, k * block_N : (k + 1) * block_N, :], K_shared) for i, j in T.Parallel(block_M, block_N): q_idx = bx * block_M + i k_idx = k * block_N + j if window_size is not None: - acc_s[i, j] = T.if_then_else(q_idx >= k_idx and q_idx < k_idx + window_size, - 0, -T.infinity(acc_s.dtype)) + acc_s[i, j] = T.if_then_else(q_idx >= k_idx and q_idx < k_idx + window_size, 0, -T.infinity(acc_s.dtype)) else: acc_s[i, j] = T.if_then_else(q_idx >= k_idx, 0, -T.infinity(acc_s.dtype)) T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, 
policy=T.GemmWarpPolicy.FullRow) - T.copy(V[bz, by // groups, k * block_N:(k + 1) * block_N, :], V_shared) + T.copy(V[bz, by // groups, k * block_N : (k + 1) * block_N, :], V_shared) T.copy(scores_max, scores_max_prev) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) # To do causal softmax, we need to set the scores_max to 0 if it is -inf # This process is called Check_inf in FlashAttention3 code, and it only need to be done # NOTE(wt): check_inf is necessary for sliding window attention. for i in T.Parallel(block_M): if window_size is not None: - scores_max[i] = T.if_then_else(scores_max[i] == -T.infinity(accum_dtype), 0, - scores_max[i]) + scores_max[i] = T.if_then_else(scores_max[i] == -T.infinity(accum_dtype), 0, scores_max[i]) scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_M, dim): @@ -122,32 +123,33 @@ def flashattn_fwd( logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] for i in T.Parallel(block_M): - logsum[i] += T.exp2(sinks[i] * 1.44269504 - - scores_max[i] * scale) # The only change for attention sink + logsum[i] += T.exp2(sinks[i] * 1.44269504 - scores_max[i] * scale) # The only change for attention sink for i, j in T.Parallel(block_M, dim): acc_o[i, j] /= logsum[i] - T.copy(acc_o, Output[bz, by, bx * block_M:(bx + 1) * block_M, :]) + T.copy(acc_o, Output[bz, by, bx * block_M : (bx + 1) * block_M, :]) for i in T.Parallel(block_M): logsum[i] = T.log2(logsum[i]) + scores_max[i] * scale - T.copy(logsum, lse[bz, by, bx * block_M:(bx + 1) * block_M]) + T.copy(logsum, lse[bz, by, bx * block_M : (bx + 1) * block_M]) return flash_fwd @tilelang.jit( - out_idx=[2], pass_configs={ + out_idx=[2], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) -def flashattn_bwd_preprocess(batch, heads, seq_len, dim, dtype: str = "float16"): - accum_dtype = "float" + }, +) +def flashattn_bwd_preprocess(batch, heads, seq_len, dim, dtype: T.dtype = T.float16): + accum_dtype = T.float32 shape = [batch, heads, seq_len, dim] blk = 32 @T.prim_func def flash_bwd_prep( - O: T.Tensor(shape, dtype), # type: ignore - dO: T.Tensor(shape, dtype), # type: ignore - Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + O: T.Tensor(shape, dtype), # type: ignore + dO: T.Tensor(shape, dtype), # type: ignore + Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore ): with T.Kernel(heads, T.ceildiv(seq_len, blk), batch) as (bx, by, bz): o = T.alloc_fragment([blk, blk], dtype) @@ -156,65 +158,61 @@ def flashattn_bwd_preprocess(batch, heads, seq_len, dim, dtype: str = "float16") delta = T.alloc_fragment([blk], accum_dtype) T.clear(acc) for k in range(T.ceildiv(dim, blk)): - T.copy(O[bz, bx, by * blk:(by + 1) * blk, k * blk:(k + 1) * blk], o) - T.copy(dO[bz, bx, by * blk:(by + 1) * blk, k * blk:(k + 1) * blk], do) + T.copy(O[bz, bx, by * blk : (by + 1) * blk, k * blk : (k + 1) * blk], o) + T.copy(dO[bz, bx, by * blk : (by + 1) * blk, k * blk : (k + 1) * blk], do) for i, j in T.Parallel(blk, blk): acc[i, j] += o[i, j] * do[i, j] T.reduce_sum(acc, delta, 1) - T.copy(delta, Delta[bz, bx, by * blk:(by + 1) * blk]) + T.copy(delta, Delta[bz, bx, by * blk : (by + 1) * blk]) return flash_bwd_prep def make_dq_layout(dQ): # atomicAdd can not be vectorized, so we need to reorder dq to match the 8x8 gemm fragment - return T.Layout(dQ.shape, - lambda b, h, l, d: [b, h, l // 8, d // 8, (d % 2), 4 * (l % 8) + (d % 8) // 2]) + return 
T.Layout(dQ.shape, lambda b, h, l, d: [b, h, l // 8, d // 8, (d % 2), 4 * (l % 8) + (d % 8) // 2]) @tilelang.jit( - out_idx=[1], pass_configs={ + out_idx=[1], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) -def flashattn_bwd_postprocess(batch, heads, seq_len, dim, dtype: str = "float16"): - accum_dtype = "float" + }, +) +def flashattn_bwd_postprocess(batch, heads, seq_len, dim, dtype: T.dtype = T.float16): + accum_dtype = T.float32 shape = [batch, heads, seq_len, dim] blk = 64 @T.prim_func def flash_bwd_post( - dQ: T.Tensor(shape, accum_dtype), # type: ignore - dQ_out: T.Tensor(shape, dtype), # type: ignore + dQ: T.Tensor(shape, accum_dtype), # type: ignore + dQ_out: T.Tensor(shape, dtype), # type: ignore ): with T.Kernel(T.ceildiv(seq_len, blk), heads, batch, threads=128) as (bx, by, bz): T.annotate_layout({dQ: make_dq_layout(dQ)}) T.copy( - dQ[bz, by, bx * blk:(bx + 1) * blk, :], - dQ_out[bz, by, bx * blk:(bx + 1) * blk, :], + dQ[bz, by, bx * blk : (bx + 1) * blk, :], + dQ_out[bz, by, bx * blk : (bx + 1) * blk, :], ) return flash_bwd_post -@tilelang.jit(pass_configs={ - tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, -}) -def flashattn_bwd(batch, - heads, - seq_len, - dim, - groups, - window_size=None, - sm_scale=None, - dtype="float16"): # None for full attention +@tilelang.jit( + pass_configs={ + tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, + } +) +def flashattn_bwd(batch, heads, seq_len, dim, groups, window_size=None, sm_scale=None, dtype=T.float16): # None for full attention if sm_scale is None: - sm_scale = (1.0 / dim)**0.5 + sm_scale = (1.0 / dim) ** 0.5 scale = sm_scale * 1.44269504 # log2(e) head_kv = heads // groups q_shape = [batch, heads, seq_len, dim] kv_shape = [batch, head_kv, seq_len, dim] - accum_dtype = "float" + accum_dtype = T.float32 block_M, block_N, num_stages, threads = get_bwd_configs() @@ -223,15 +221,15 @@ def flashattn_bwd(batch, @T.prim_func def flash_bwd( - Q: T.Tensor(q_shape, dtype), # type: ignore - K: T.Tensor(kv_shape, dtype), # type: ignore - V: T.Tensor(kv_shape, dtype), # type: ignore - dO: T.Tensor(q_shape, dtype), # type: ignore - lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore - Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore - dQ: T.Tensor(q_shape, accum_dtype), # type: ignore - dK: T.Tensor(kv_shape, accum_dtype), # type: ignore - dV: T.Tensor(kv_shape, accum_dtype), # type: ignore + Q: T.Tensor(q_shape, dtype), # type: ignore + K: T.Tensor(kv_shape, dtype), # type: ignore + V: T.Tensor(kv_shape, dtype), # type: ignore + dO: T.Tensor(q_shape, dtype), # type: ignore + lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + dQ: T.Tensor(q_shape, accum_dtype), # type: ignore + dK: T.Tensor(kv_shape, accum_dtype), # type: ignore + dV: T.Tensor(kv_shape, accum_dtype), # type: ignore ): with T.Kernel(heads, T.ceildiv(seq_len, block_M), batch, threads=threads) as (bx, by, bz): K_shared = T.alloc_shared([block_M, dim], dtype) @@ -251,44 +249,47 @@ def flashattn_bwd(batch, dv_shared = T.alloc_shared([block_M, dim], accum_dtype) dk_shared = T.alloc_shared([block_M, dim], accum_dtype) - T.annotate_layout({ - dQ: make_dq_layout(dQ), - K_shared: tilelang.layout.make_swizzled_layout(K_shared), - dv_shared: tilelang.layout.make_swizzled_layout(dv_shared), - dk_shared: tilelang.layout.make_swizzled_layout(dk_shared), - }) - T.copy(K[bz, bx // groups, by * block_M:(by + 1) * block_M, :], K_shared) - 
T.copy(V[bz, bx // groups, by * block_M:(by + 1) * block_M, :], V_shared) + T.annotate_layout( + { + dQ: make_dq_layout(dQ), + K_shared: tilelang.layout.make_swizzled_layout(K_shared), + dv_shared: tilelang.layout.make_swizzled_layout(dv_shared), + dk_shared: tilelang.layout.make_swizzled_layout(dk_shared), + } + ) + T.copy(K[bz, bx // groups, by * block_M : (by + 1) * block_M, :], K_shared) + T.copy(V[bz, bx // groups, by * block_M : (by + 1) * block_M, :], V_shared) T.clear(dv) T.clear(dk) loop_st = T.floordiv(by * block_M, block_N) - loop_ed = T.min( - T.ceildiv((by + 1) * block_M + window_size, block_N), T.ceildiv( - seq_len, block_N)) if window_size is not None else T.ceildiv(seq_len, block_N) + loop_ed = ( + T.min(T.ceildiv((by + 1) * block_M + window_size, block_N), T.ceildiv(seq_len, block_N)) + if window_size is not None + else T.ceildiv(seq_len, block_N) + ) for k in T.Pipelined(loop_st, loop_ed, num_stages=num_stages): - T.copy(Q[bz, bx, k * block_N:(k + 1) * block_N, :], q) + T.copy(Q[bz, bx, k * block_N : (k + 1) * block_N, :], q) T.clear(qkT) T.gemm(K_shared, q, qkT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - T.copy(lse[bz, bx, k * block_N:(k + 1) * block_N], lse_shared) + T.copy(lse[bz, bx, k * block_N : (k + 1) * block_N], lse_shared) for i, j in T.Parallel(block_M, block_N): qkT[i, j] = T.exp2(qkT[i, j] * scale - lse_shared[j]) for i, j in T.Parallel(block_M, block_N): if window_size is not None: qkT[i, j] = T.if_then_else( - by * block_M + i <= k * block_N + j and - by * block_M + i > k * block_N + j - window_size, qkT[i, j], 0) + by * block_M + i <= k * block_N + j and by * block_M + i > k * block_N + j - window_size, qkT[i, j], 0 + ) else: - qkT[i, j] = T.if_then_else(by * block_M + i <= k * block_N + j, qkT[i, j], - 0) - T.copy(dO[bz, bx, k * block_N:(k + 1) * block_N, :], dst=do) + qkT[i, j] = T.if_then_else(by * block_M + i <= k * block_N + j, qkT[i, j], 0) + T.copy(dO[bz, bx, k * block_N : (k + 1) * block_N, :], dst=do) T.clear(dsT) T.gemm(V_shared, do, dsT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) T.copy(qkT, qkT_cast) T.gemm(qkT_cast, do, dv, policy=T.GemmWarpPolicy.FullRow) - T.copy(Delta[bz, bx, k * block_N:(k + 1) * block_N], delta) + T.copy(Delta[bz, bx, k * block_N : (k + 1) * block_N], delta) for i, j in T.Parallel(block_M, block_N): dsT_cast[i, j] = qkT[i, j] * (dsT[i, j] - delta[j]) * sm_scale @@ -297,27 +298,27 @@ def flashattn_bwd(batch, T.copy(dsT_cast, dsT_shared) T.clear(dq) T.gemm(dsT_shared, K_shared, dq, transpose_A=True) - T.atomic_add(dQ[bz, bx, k * block_N:(k + 1) * block_N, :], dq) + T.atomic_add(dQ[bz, bx, k * block_N : (k + 1) * block_N, :], dq) T.copy(dv, dv_shared) - T.atomic_add(dV[bz, bx // groups, by * block_M:(by + 1) * block_M, :], dv_shared) + T.atomic_add(dV[bz, bx // groups, by * block_M : (by + 1) * block_M, :], dv_shared) T.copy(dk, dk_shared) - T.atomic_add(dK[bz, bx // groups, by * block_M:(by + 1) * block_M, :], dk_shared) + T.atomic_add(dK[bz, bx // groups, by * block_M : (by + 1) * block_M, :], dk_shared) return flash_bwd @tilelang.jit(out_idx=-1) -def flashattn_bwd_dsink(batch, heads, seq_len, block=256, dtype: str = "float16"): - accum_dtype = "float" +def flashattn_bwd_dsink(batch, heads, seq_len, block=256, dtype: T.dtype = T.float16): + accum_dtype = T.float32 shape = [batch, heads, seq_len] @T.prim_func def flash_bwd_dsink( - Sinks: T.Tensor([heads], dtype), # type: ignore - Delta: T.Tensor(shape, accum_dtype), # type: ignore - lse: T.Tensor(shape, accum_dtype), # type: ignore - dsinks: 
T.Tensor(shape, dtype), # type: ignore + Sinks: T.Tensor([heads], dtype), # type: ignore + Delta: T.Tensor(shape, accum_dtype), # type: ignore + lse: T.Tensor(shape, accum_dtype), # type: ignore + dsinks: T.Tensor(shape, dtype), # type: ignore ): with T.Kernel(heads, T.ceildiv(seq_len, block), batch, threads=256) as (bx, by, bz): sink = T.alloc_local([1], dtype) @@ -326,21 +327,18 @@ def flashattn_bwd_dsink(batch, heads, seq_len, block=256, dtype: str = "float16" dsink_fragment = T.alloc_fragment([block], dtype) sink[0] = Sinks[bx] - T.copy(lse[bz, bx, by * block:(by + 1) * block], lse_fragment) - T.copy(Delta[bz, bx, by * block:(by + 1) * block], delta_fragment) + T.copy(lse[bz, bx, by * block : (by + 1) * block], lse_fragment) + T.copy(Delta[bz, bx, by * block : (by + 1) * block], delta_fragment) for i in T.Parallel(block): - dsink_fragment[i] = -T.exp2(Sinks[bx] * 1.44269504 - - lse_fragment[i]) * delta_fragment[i] - T.copy(dsink_fragment, dsinks[bz, bx, by * block:(by + 1) * block]) + dsink_fragment[i] = -T.exp2(Sinks[bx] * 1.44269504 - lse_fragment[i]) * delta_fragment[i] + T.copy(dsink_fragment, dsinks[bz, bx, by * block : (by + 1) * block]) return flash_bwd_dsink class _attention(torch.autograd.Function): - @staticmethod def forward(ctx, q, k, v, sinks, window_size, groups): - def maybe_contiguous(x): if x.stride(-1) != 1: return x.contiguous() @@ -348,7 +346,7 @@ class _attention(torch.autograd.Function): q, k, v, sinks = [maybe_contiguous(x) for x in (q, k, v, sinks)] BATCH, H, N_CTX, D_HEAD = q.shape - dtype = "float16" if q.dtype == torch.float16 else "bfloat16" + dtype = T.float16 if q.dtype == torch.float16 else T.bfloat16 kernel = flashattn_fwd(BATCH, H, N_CTX, D_HEAD, groups, window_size, dtype=dtype) o, lse = kernel(q, k, v, sinks) ctx.save_for_backward(q, k, v, sinks, o, lse) @@ -361,7 +359,7 @@ class _attention(torch.autograd.Function): q, k, v, sinks, o, lse = ctx.saved_tensors BATCH, H, N_CTX, D_HEAD = q.shape groups = ctx.groups - dtype = "float16" if q.dtype == torch.float16 else "bfloat16" + dtype = T.float16 if q.dtype == torch.float16 else T.bfloat16 kernel_prep = flashattn_bwd_preprocess(BATCH, H, N_CTX, D_HEAD, dtype=dtype) kernel_post = flashattn_bwd_postprocess(BATCH, H, N_CTX, D_HEAD, dtype=dtype) @@ -386,13 +384,14 @@ attention = _attention.apply # Adapted and optimized from # https://github.com/openai/gpt-oss/blob/main/gpt_oss/triton/attention.py -def ref_program(query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - sinks: torch.Tensor, - sliding_window: Optional[int] = None, - dtype: torch.dtype = torch.float16) -> torch.Tensor: - +def ref_program( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + sinks: torch.Tensor, + sliding_window: Optional[int] = None, + dtype: torch.dtype = torch.float16, +) -> torch.Tensor: key = key.transpose(1, 2).contiguous() value = value.transpose(1, 2).contiguous() batch_size, num_keys, num_key_value_heads, head_dim = key.shape @@ -428,32 +427,32 @@ def ref_program(query: torch.Tensor, output = torch.einsum("bhmqk,bkhmd->bqhmd", scores, value.float()) - output = output.reshape(batch_size, num_queries, num_key_value_heads * num_key_value_groups, - head_dim).to(dtype) + output = output.reshape(batch_size, num_queries, num_key_value_heads * num_key_value_groups, head_dim).to(dtype) return output.transpose(1, 2).contiguous() -def main(BATCH: int = 1, - H: int = 8, - N_CTX: int = 512, - D_HEAD: int = 64, - groups: int = 2, - window_size: Optional[int] = None, - dtype: str = "float16"): - torch_dtype = 
{"float16": torch.float16, "bfloat16": torch.bfloat16}[dtype] +def main( + BATCH: int = 1, + H: int = 8, + N_CTX: int = 512, + D_HEAD: int = 64, + groups: int = 2, + window_size: Optional[int] = None, + dtype: str = "float16", +): + dtype = T.dtype(dtype) + torch_dtype = dtype.as_torch() if window_size is not None: - print('Using sliding window attention.') + print("Using sliding window attention.") assert window_size <= N_CTX - flops_per_matmul = 2.0 * BATCH * H * min( - window_size, N_CTX // 2) * N_CTX * D_HEAD # just a rough estimation + flops_per_matmul = 2.0 * BATCH * H * min(window_size, N_CTX // 2) * N_CTX * D_HEAD # just a rough estimation else: - print('Using full attention.') + print("Using full attention.") flops_per_matmul = 2.0 * BATCH * H * N_CTX * N_CTX * D_HEAD * 0.5 total_flops = 5 * flops_per_matmul - Q = (torch.randn(BATCH, H, N_CTX, D_HEAD, dtype=torch_dtype, device="cuda").requires_grad_()) - K = torch.randn( - BATCH, H // groups, N_CTX, D_HEAD, dtype=torch_dtype, device="cuda").requires_grad_() + Q = torch.randn(BATCH, H, N_CTX, D_HEAD, dtype=torch_dtype, device="cuda").requires_grad_() + K = torch.randn(BATCH, H // groups, N_CTX, D_HEAD, dtype=torch_dtype, device="cuda").requires_grad_() V = torch.randn_like(K).requires_grad_() sinks = torch.randn(H, dtype=torch_dtype, device="cuda").requires_grad_() dO = torch.randn_like(Q) @@ -474,19 +473,14 @@ def main(BATCH: int = 1, # Checks rtol, atol = { - "float16": (1e-2, 1e-2), - "bfloat16": (2e-2, 2e-2), + T.float16: (1e-2, 1e-2), + T.bfloat16: (2e-2, 2e-2), }[dtype] - assert torch.allclose(O, O_ref, rtol=rtol, atol=atol), f'O max err: {(O-O_ref).abs().max()}' - assert torch.allclose( - dV, dV_ref, rtol=rtol, atol=atol), f'dV max err: {(dV-dV_ref).abs().max()}' - assert torch.allclose( - dK, dK_ref, rtol=rtol, atol=atol), f'dK max err: {(dK-dK_ref).abs().max()}' - assert torch.allclose( - dQ, dQ_ref, rtol=rtol, atol=atol), f'dq max err: {(dQ-dQ_ref).abs().max()}' - assert torch.allclose( - dsinks, dsinks_ref, rtol=rtol, - atol=atol), f'dsinks max err: {(dsinks-dsinks_ref).abs().max()}' + assert torch.allclose(O, O_ref, rtol=rtol, atol=atol), f"O max err: {(O - O_ref).abs().max()}" + assert torch.allclose(dV, dV_ref, rtol=rtol, atol=atol), f"dV max err: {(dV - dV_ref).abs().max()}" + assert torch.allclose(dK, dK_ref, rtol=rtol, atol=atol), f"dK max err: {(dK - dK_ref).abs().max()}" + assert torch.allclose(dQ, dQ_ref, rtol=rtol, atol=atol), f"dq max err: {(dQ - dQ_ref).abs().max()}" + assert torch.allclose(dsinks, dsinks_ref, rtol=rtol, atol=atol), f"dsinks max err: {(dsinks - dsinks_ref).abs().max()}" print("All checks passed for tilelang kernels.✅") @@ -507,17 +501,12 @@ def main(BATCH: int = 1, if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=1, help='Batch size') - parser.add_argument('--h', type=int, default=64, help='Number of heads') - parser.add_argument('--n_ctx', type=int, default=4096, help='Context size') - parser.add_argument('--d_head', type=int, default=128, help='Head dimension') - parser.add_argument('--groups', type=int, default=8, help='Groups') - parser.add_argument( - '--window_size', - type=int, - default=None, - help='window size (default: None, which means full attention)') - parser.add_argument( - '--dtype', type=str, default="float16", help="dtype, can be float16 or bfloat16") + parser.add_argument("--batch", type=int, default=1, help="Batch size") + parser.add_argument("--h", type=int, default=64, help="Number of heads") + 
parser.add_argument("--n_ctx", type=int, default=4096, help="Context size") + parser.add_argument("--d_head", type=int, default=128, help="Head dimension") + parser.add_argument("--groups", type=int, default=8, help="Groups") + parser.add_argument("--window_size", type=int, default=None, help="window size (default: None, which means full attention)") + parser.add_argument("--dtype", type=str, default="float16", help="dtype, can be float16 or bfloat16") args = parser.parse_args() main(args.batch, args.h, args.n_ctx, args.d_head, args.groups, args.window_size, args.dtype) diff --git a/examples/attention_sink/example_gqa_sink_fwd_bhsd_wgmma_pipelined.py b/examples/attention_sink/example_gqa_sink_fwd_bhsd_wgmma_pipelined.py index 7765603af58ad2aefec658e162828f068fd292be..df157cd0ff396c3f5e358c71334dc77695e72315 100644 --- a/examples/attention_sink/example_gqa_sink_fwd_bhsd_wgmma_pipelined.py +++ b/examples/attention_sink/example_gqa_sink_fwd_bhsd_wgmma_pipelined.py @@ -23,9 +23,11 @@ def get_configs(): rep=100, ) @tilelang.jit( - out_idx=[3], pass_configs={ + out_idx=[3], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def flashattn( batch, heads, @@ -39,20 +41,19 @@ def flashattn( block_N=128, num_stages=2, threads=256, - dtype: str = "float16", + dtype: T.dtype = T.float16, ): - if window_size is not None: assert window_size % block_N == 0, "window_size must be divisible by block_N" if sm_scale is None: - sm_scale = (1.0 / dim)**0.5 + sm_scale = (1.0 / dim) ** 0.5 scale = sm_scale * 1.44269504 # log2(e) head_kv = heads // groups q_shape = [batch, heads, seq_q, dim] kv_shape = [batch, head_kv, seq_kv, dim] - accum_dtype = "float" + accum_dtype = T.float32 past_len = seq_kv - seq_q assert past_len >= 0, "seq_kv must be greater than or equal to seq_q" @@ -68,13 +69,12 @@ def flashattn( by: T.int32, bz: T.int32, ): - T.copy(K[bz, by // groups, k * block_N:(k + 1) * block_N, :], K_shared) + T.copy(K[bz, by // groups, k * block_N : (k + 1) * block_N, :], K_shared) for i, j in T.Parallel(block_M, block_N): q_idx = bx * block_M + i + past_len k_idx = k * block_N + j if window_size is not None: - acc_s[i, j] = T.if_then_else(q_idx >= k_idx and q_idx < k_idx + window_size, 0, - -T.infinity(acc_s.dtype)) + acc_s[i, j] = T.if_then_else(q_idx >= k_idx and q_idx < k_idx + window_size, 0, -T.infinity(acc_s.dtype)) else: acc_s[i, j] = T.if_then_else(q_idx >= k_idx, 0, -T.infinity(acc_s.dtype)) T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) @@ -89,29 +89,30 @@ def flashattn( by: T.int32, bz: T.int32, ): - T.copy(V[bz, by // groups, k * block_N:(k + 1) * block_N, :], V_shared) + T.copy(V[bz, by // groups, k * block_N : (k + 1) * block_N, :], V_shared) T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) @T.macro def Softmax( - acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype), - acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype), - scores_max: T.FragmentBuffer([block_M], accum_dtype), - scores_max_prev: T.FragmentBuffer([block_M], accum_dtype), - scores_scale: T.FragmentBuffer([block_M], accum_dtype), - scores_sum: T.FragmentBuffer([block_M], accum_dtype), - logsum: T.FragmentBuffer([block_M], accum_dtype), + acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype), + acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype), + scores_max: T.FragmentBuffer([block_M], accum_dtype), + scores_max_prev: T.FragmentBuffer([block_M], accum_dtype), + scores_scale: T.FragmentBuffer([block_M], accum_dtype), + scores_sum: 
T.FragmentBuffer([block_M], accum_dtype), + logsum: T.FragmentBuffer([block_M], accum_dtype), ): T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) # To do causal softmax, we need to set the scores_max to 0 if it is -inf # This process is called Check_inf in FlashAttention3 code, and it only need to be done # NOTE(wt): check_inf is necessary for sliding window attention. for i in T.Parallel(block_M): if window_size is not None: - scores_max[i] = T.if_then_else(scores_max[i] == -T.infinity(accum_dtype), 0, - scores_max[i]) + scores_max[i] = T.if_then_else(scores_max[i] == -T.infinity(accum_dtype), 0, scores_max[i]) scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_M, block_N): @@ -126,19 +127,19 @@ def flashattn( @T.macro def Rescale( - acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), - scores_scale: T.FragmentBuffer([block_M], accum_dtype), + acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), + scores_scale: T.FragmentBuffer([block_M], accum_dtype), ): for i, j in T.Parallel(block_M, dim): acc_o[i, j] *= scores_scale[i] @T.prim_func def main( - Q: T.Tensor(q_shape, dtype), - K: T.Tensor(kv_shape, dtype), - V: T.Tensor(kv_shape, dtype), - Output: T.Tensor(q_shape, dtype), - Sinks: T.Tensor([heads], dtype), + Q: T.Tensor(q_shape, dtype), + K: T.Tensor(kv_shape, dtype), + V: T.Tensor(kv_shape, dtype), + Output: T.Tensor(q_shape, dtype), + Sinks: T.Tensor([heads], dtype), ): with T.Kernel(T.ceildiv(seq_q, block_M), heads, batch, threads=threads) as (bx, by, bz): Q_shared = T.alloc_shared([block_M, dim], dtype) @@ -155,58 +156,58 @@ def flashattn( logsum = T.alloc_fragment([block_M], accum_dtype) sinks = T.alloc_fragment([block_M], dtype) - T.annotate_layout({ - Q_shared: make_swizzled_layout(Q_shared), - K_shared: make_swizzled_layout(K_shared), - V_shared: make_swizzled_layout(V_shared), - O_shared: make_swizzled_layout(O_shared), - }) + T.annotate_layout( + { + Q_shared: make_swizzled_layout(Q_shared), + K_shared: make_swizzled_layout(K_shared), + V_shared: make_swizzled_layout(V_shared), + O_shared: make_swizzled_layout(O_shared), + } + ) - T.copy(Q[bz, by, bx * block_M:(bx + 1) * block_M, :], Q_shared) + T.copy(Q[bz, by, bx * block_M : (bx + 1) * block_M, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) for i in T.Parallel(block_M): sinks[i] = Sinks[by] - end = T.min( - T.ceildiv(seq_kv, block_N), T.ceildiv((bx + 1) * block_M + past_len, block_N)) + end = T.min(T.ceildiv(seq_kv, block_N), T.ceildiv((bx + 1) * block_M + past_len, block_N)) - start = T.max(0, (bx * block_M + past_len - window_size) // - block_N) if window_size is not None else 0 + start = T.max(0, (bx * block_M + past_len - window_size) // block_N) if window_size is not None else 0 for k in T.Pipelined( - start, - end, - num_stages=num_stages, - order=[-1, 0, 3, 1, -1, 2], - stage=[-1, 0, 0, 1, -1, 1], - group=[[0], [1, 2], [3, 4, 5, 6, 7, 8, 9, 10], [11], [12], [13]]): + start, + end, + num_stages=num_stages, + order=[-1, 0, 3, 1, -1, 2], + stage=[-1, 0, 0, 1, -1, 1], + group=[[0], [1, 2], [3, 4, 5, 6, 7, 8, 9, 10, 11], [12], [13], [14]], + ): MMA0(K, Q_shared, K_shared, acc_s, k, bx, by, bz) - Softmax(acc_s, acc_s_cast, scores_max, scores_max_prev, scores_scale, scores_sum, - logsum) + Softmax(acc_s, acc_s_cast, scores_max, scores_max_prev, 
scores_scale, scores_sum, logsum) Rescale(acc_o, scores_scale) MMA1(V, V_shared, acc_s_cast, acc_o, k, by, bz) for i in T.Parallel(block_M): - logsum[i] += T.exp2(sinks[i] * 1.44269504 - - scores_max[i] * scale) # The only change for attention sink + logsum[i] += T.exp2(sinks[i] * 1.44269504 - scores_max[i] * scale) # The only change for attention sink for i, j in T.Parallel(block_M, dim): acc_o[i, j] /= logsum[i] T.copy(acc_o, O_shared) - T.copy(O_shared, Output[bz, by, bx * block_M:(bx + 1) * block_M, :]) + T.copy(O_shared, Output[bz, by, bx * block_M : (bx + 1) * block_M, :]) return main # Following functions are adapted and optimized from # https://github.com/openai/gpt-oss/blob/main/gpt_oss/triton/attention.py -def ref_program(query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - sinks: torch.Tensor, - sliding_window: Optional[int] = None, - dtype: torch.dtype = torch.float16) -> torch.Tensor: - +def ref_program( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + sinks: torch.Tensor, + sliding_window: Optional[int] = None, + dtype: torch.dtype = torch.float16, +) -> torch.Tensor: key = key.transpose(1, 2).contiguous() value = value.transpose(1, 2).contiguous() batch_size, num_keys, num_key_value_heads, head_dim = key.shape @@ -242,23 +243,15 @@ def ref_program(query: torch.Tensor, output = torch.einsum("bhmqk,bkhmd->bqhmd", scores, value.float()) - output = output.reshape(batch_size, num_queries, num_key_value_heads * num_key_value_groups, - head_dim).to(dtype) + output = output.reshape(batch_size, num_queries, num_key_value_heads * num_key_value_groups, head_dim).to(dtype) return output.transpose(1, 2).contiguous() -def gen_inputs( - B, - H, - Sq, - Skv, - D, - groups, - dtype=torch.float16) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - query = torch.randn([B, H, Sq, D], dtype=dtype, device='cuda') - key = torch.randn([B, H // groups, Skv, D], dtype=dtype, device='cuda') - value = torch.randn([B, H // groups, Skv, D], dtype=dtype, device='cuda') - sinks = torch.randn([H], dtype=dtype, device='cuda') +def gen_inputs(B, H, Sq, Skv, D, groups, dtype=torch.float16) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + query = torch.randn([B, H, Sq, D], dtype=dtype, device="cuda") + key = torch.randn([B, H // groups, Skv, D], dtype=dtype, device="cuda") + value = torch.randn([B, H // groups, Skv, D], dtype=dtype, device="cuda") + sinks = torch.randn([H], dtype=dtype, device="cuda") return query, key, value, sinks @@ -270,17 +263,17 @@ def main( dim: int = 128, groups: int = 8, window_size: Optional[int] = None, - dtype: str = "float16", + dtype: T.dtype = T.float16, tune: bool = False, ): - torch_dtype = {"float16": torch.float16, "bfloat16": torch.bfloat16}[dtype] + dtype = T.dtype(dtype) + torch_dtype = dtype.as_torch() if window_size is not None: - print('Using sliding window attention.') + print("Using sliding window attention.") assert window_size <= seq_q - flops_per_matmul = 2.0 * batch * heads * min( - window_size, seq_kv // 2) * seq_q * dim # just a rough estimation + flops_per_matmul = 2.0 * batch * heads * min(window_size, seq_kv // 2) * seq_q * dim # just a rough estimation else: - print('Using full attention.') + print("Using full attention.") flops_per_matmul = 2.0 * batch * heads * seq_q * seq_kv * dim * 0.5 total_flops = 2 * flops_per_matmul @@ -308,15 +301,14 @@ def main( block_N=block_N, num_stages=num_stages, threads=threads, - dtype=dtype) + dtype=dtype, + ) Q, K, V, sinks = gen_inputs(batch, heads, 
seq_q, seq_kv, dim, groups, dtype=torch_dtype) torch.testing.assert_close( - kernel(Q, K, V, sinks), - ref_program(Q, K, V, sinks, window_size, dtype=torch_dtype), - rtol=1e-2, - atol=1e-2) + kernel(Q, K, V, sinks), ref_program(Q, K, V, sinks, window_size, dtype=torch_dtype), rtol=1e-2, atol=1e-2 + ) print("All checks passed.✅") # Benchmark tilelang @@ -327,20 +319,14 @@ def main( if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=1, help='batch size') - parser.add_argument('--heads', type=int, default=64, help='heads') - parser.add_argument('--seq_q', type=int, default=2048, help='sequence length of query') - parser.add_argument('--seq_kv', type=int, default=2048, help='sequence length of key/value') - parser.add_argument('--dim', type=int, default=128, help='dim') - parser.add_argument('--groups', type=int, default=8, help='groups') - parser.add_argument( - '--window_size', - type=int, - default=None, - help='window size (default: None, which means full attention)') - parser.add_argument( - '--dtype', type=str, default="float16", help="dtype, can be float16 or bfloat16") - parser.add_argument('--tune', action='store_true', help='tune configs') + parser.add_argument("--batch", type=int, default=1, help="batch size") + parser.add_argument("--heads", type=int, default=64, help="heads") + parser.add_argument("--seq_q", type=int, default=2048, help="sequence length of query") + parser.add_argument("--seq_kv", type=int, default=2048, help="sequence length of key/value") + parser.add_argument("--dim", type=int, default=128, help="dim") + parser.add_argument("--groups", type=int, default=8, help="groups") + parser.add_argument("--window_size", type=int, default=None, help="window size (default: None, which means full attention)") + parser.add_argument("--dtype", type=str, default="float16", help="dtype, can be float16 or bfloat16") + parser.add_argument("--tune", action="store_true", help="tune configs") args = parser.parse_args() - main(args.batch, args.heads, args.seq_q, args.seq_kv, args.dim, args.groups, args.window_size, - args.dtype, args.tune) + main(args.batch, args.heads, args.seq_q, args.seq_kv, args.dim, args.groups, args.window_size, args.dtype, args.tune) diff --git a/examples/attention_sink/example_mha_sink_bwd_bhsd.py b/examples/attention_sink/example_mha_sink_bwd_bhsd.py index 866668e413a9413390367d7b06ba90f2486db94a..be405e8bc3c986d0b3241d650c1ab1652e1081ec 100644 --- a/examples/attention_sink/example_mha_sink_bwd_bhsd.py +++ b/examples/attention_sink/example_mha_sink_bwd_bhsd.py @@ -20,40 +20,42 @@ def get_bwd_configs(): @tilelang.jit( - out_idx=[3, 4], pass_configs={ + out_idx=[3, 4], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def flashattn_fwd( - batch, - heads, - seq_len, - dim, - window_size=None, # None for full attention, - sm_scale=None, - block_M=64, - block_N=64, - num_stages=1, - threads=128, - dtype: str = "float16"): - + batch, + heads, + seq_len, + dim, + window_size=None, # None for full attention, + sm_scale=None, + block_M=64, + block_N=64, + num_stages=1, + threads=128, + dtype: T.dtype = T.float16, +): if window_size is not None: assert window_size % block_N == 0, "window_size must be divisible by block_N" if sm_scale is None: - sm_scale = (1.0 / dim)**0.5 + sm_scale = (1.0 / dim) ** 0.5 scale = sm_scale * 1.44269504 # log2(e) shape = [batch, heads, seq_len, dim] - accum_dtype = "float" + accum_dtype = T.float32 @T.prim_func def flash_fwd( - Q: T.Tensor(shape, 
dtype), # type: ignore - K: T.Tensor(shape, dtype), # type: ignore - V: T.Tensor(shape, dtype), # type: ignore - Output: T.Tensor(shape, dtype), # type: ignore - lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore - Sinks: T.Tensor([heads], dtype), # type: ignore + Q: T.Tensor(shape, dtype), # type: ignore + K: T.Tensor(shape, dtype), # type: ignore + V: T.Tensor(shape, dtype), # type: ignore + Output: T.Tensor(shape, dtype), # type: ignore + lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + Sinks: T.Tensor([heads], dtype), # type: ignore ): with T.Kernel(T.ceildiv(seq_len, block_M), heads, batch, threads=threads) as (bx, by, bz): Q_shared = T.alloc_shared([block_M, dim], dtype) @@ -70,7 +72,7 @@ def flashattn_fwd( sinks = T.alloc_fragment([heads], dtype) T.annotate_layout({Q_shared: tilelang.layout.make_swizzled_layout(Q_shared)}) - T.copy(Q[bz, by, bx * block_M:(bx + 1) * block_M, :], Q_shared) + T.copy(Q[bz, by, bx * block_M : (bx + 1) * block_M, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) @@ -78,31 +80,30 @@ def flashattn_fwd( sinks[i] = Sinks[by] end = T.min(T.ceildiv(seq_len, block_N), T.ceildiv((bx + 1) * block_M, block_N)) - start = T.max(0, - (bx * block_M - window_size) // block_N) if window_size is not None else 0 + start = T.max(0, (bx * block_M - window_size) // block_N) if window_size is not None else 0 for k in T.Pipelined(start, end, num_stages=num_stages): - T.copy(K[bz, by, k * block_N:(k + 1) * block_N, :], K_shared) + T.copy(K[bz, by, k * block_N : (k + 1) * block_N, :], K_shared) for i, j in T.Parallel(block_M, block_N): q_idx = bx * block_M + i k_idx = k * block_N + j if window_size is not None: - acc_s[i, j] = T.if_then_else(q_idx >= k_idx and q_idx < k_idx + window_size, - 0, -T.infinity(acc_s.dtype)) + acc_s[i, j] = T.if_then_else(q_idx >= k_idx and q_idx < k_idx + window_size, 0, -T.infinity(acc_s.dtype)) else: acc_s[i, j] = T.if_then_else(q_idx >= k_idx, 0, -T.infinity(acc_s.dtype)) T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - T.copy(V[bz, by, k * block_N:(k + 1) * block_N, :], V_shared) + T.copy(V[bz, by, k * block_N : (k + 1) * block_N, :], V_shared) T.copy(scores_max, scores_max_prev) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) # To do causal softmax, we need to set the scores_max to 0 if it is -inf # This process is called Check_inf in FlashAttention3 code, and it only need to be done # NOTE(wt): check_inf is necessary for sliding window attention. 
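# Informal sketch of the rescale step below (standard online softmax, in this kernel's notation):
#   m_new[i] = max(m_prev[i], rowmax(acc_s[i, :]))
#   scores_scale[i] = exp2(m_prev[i] * scale - m_new[i] * scale) = exp(sm_scale * (m_prev[i] - m_new[i]))
# scores_scale then multiplies acc_o and logsum so earlier tiles stay normalized to the new running max.
# With a sliding window a row may have no valid key yet, leaving its max at -inf; clamping it to 0
# avoids exp2(-inf - (-inf)) = NaN in scores_scale.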
for i in T.Parallel(block_M): if window_size is not None: - scores_max[i] = T.if_then_else(scores_max[i] == -T.infinity(accum_dtype), 0, - scores_max[i]) + scores_max[i] = T.if_then_else(scores_max[i] == -T.infinity(accum_dtype), 0, scores_max[i]) scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_M, dim): @@ -119,32 +120,33 @@ def flashattn_fwd( logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] for i in T.Parallel(block_M): - logsum[i] += T.exp2(sinks[i] * 1.44269504 - - scores_max[i] * scale) # The only change for attention sink + logsum[i] += T.exp2(sinks[i] * 1.44269504 - scores_max[i] * scale) # The only change for attention sink for i, j in T.Parallel(block_M, dim): acc_o[i, j] /= logsum[i] - T.copy(acc_o, Output[bz, by, bx * block_M:(bx + 1) * block_M, :]) + T.copy(acc_o, Output[bz, by, bx * block_M : (bx + 1) * block_M, :]) for i in T.Parallel(block_M): logsum[i] = T.log2(logsum[i]) + scores_max[i] * scale - T.copy(logsum, lse[bz, by, bx * block_M:(bx + 1) * block_M]) + T.copy(logsum, lse[bz, by, bx * block_M : (bx + 1) * block_M]) return flash_fwd @tilelang.jit( - out_idx=[2], pass_configs={ + out_idx=[2], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) -def flashattn_bwd_preprocess(batch, heads, seq_len, dim, dtype: str = "float16"): - accum_dtype = "float" + }, +) +def flashattn_bwd_preprocess(batch, heads, seq_len, dim, dtype: T.dtype = T.float16): + accum_dtype = T.float32 shape = [batch, heads, seq_len, dim] blk = 32 @T.prim_func def flash_bwd_prep( - O: T.Tensor(shape, dtype), # type: ignore - dO: T.Tensor(shape, dtype), # type: ignore - Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + O: T.Tensor(shape, dtype), # type: ignore + dO: T.Tensor(shape, dtype), # type: ignore + Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore ): with T.Kernel(heads, T.ceildiv(seq_len, blk), batch) as (bx, by, bz): o = T.alloc_fragment([blk, blk], dtype) @@ -153,49 +155,52 @@ def flashattn_bwd_preprocess(batch, heads, seq_len, dim, dtype: str = "float16") delta = T.alloc_fragment([blk], accum_dtype) T.clear(acc) for k in range(T.ceildiv(dim, blk)): - T.copy(O[bz, bx, by * blk:(by + 1) * blk, k * blk:(k + 1) * blk], o) - T.copy(dO[bz, bx, by * blk:(by + 1) * blk, k * blk:(k + 1) * blk], do) + T.copy(O[bz, bx, by * blk : (by + 1) * blk, k * blk : (k + 1) * blk], o) + T.copy(dO[bz, bx, by * blk : (by + 1) * blk, k * blk : (k + 1) * blk], do) for i, j in T.Parallel(blk, blk): acc[i, j] += o[i, j] * do[i, j] T.reduce_sum(acc, delta, 1) - T.copy(delta, Delta[bz, bx, by * blk:(by + 1) * blk]) + T.copy(delta, Delta[bz, bx, by * blk : (by + 1) * blk]) return flash_bwd_prep def make_dq_layout(dQ): # atomicAdd can not be vectorized, so we need to reorder dq to match the 8x8 gemm fragment - return T.Layout(dQ.shape, - lambda b, h, l, d: [b, h, l // 8, d // 8, (d % 2), 4 * (l % 8) + (d % 8) // 2]) + return T.Layout(dQ.shape, lambda b, h, l, d: [b, h, l // 8, d // 8, (d % 2), 4 * (l % 8) + (d % 8) // 2]) @tilelang.jit( - out_idx=[1], pass_configs={ + out_idx=[1], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) -def flashattn_bwd_postprocess(batch, heads, seq_len, dim, dtype: str = "float16"): - accum_dtype = "float" + }, +) +def flashattn_bwd_postprocess(batch, heads, seq_len, dim, dtype: T.dtype = T.float16): + accum_dtype = T.float32 shape = [batch, heads, seq_len, dim] blk = 64 @T.prim_func def flash_bwd_post( - dQ: T.Tensor(shape, accum_dtype), # type: 
ignore - dQ_out: T.Tensor(shape, dtype), # type: ignore + dQ: T.Tensor(shape, accum_dtype), # type: ignore + dQ_out: T.Tensor(shape, dtype), # type: ignore ): with T.Kernel(T.ceildiv(seq_len, blk), heads, batch, threads=128) as (bx, by, bz): T.annotate_layout({dQ: make_dq_layout(dQ)}) T.copy( - dQ[bz, by, bx * blk:(bx + 1) * blk, :], - dQ_out[bz, by, bx * blk:(bx + 1) * blk, :], + dQ[bz, by, bx * blk : (bx + 1) * blk, :], + dQ_out[bz, by, bx * blk : (bx + 1) * blk, :], ) return flash_bwd_post -@tilelang.jit(pass_configs={ - tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, -}) +@tilelang.jit( + pass_configs={ + tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, + } +) def flashattn_bwd( batch, heads, @@ -203,32 +208,31 @@ def flashattn_bwd( dim, window_size=None, # None for full attention sm_scale=None, - dtype: str = "float16", + dtype: T.dtype = T.float16, ): - block_M, block_N, num_stages, threads = get_bwd_configs() if sm_scale is None: - sm_scale = (1.0 / dim)**0.5 + sm_scale = (1.0 / dim) ** 0.5 scale = sm_scale * 1.44269504 # log2(e) shape = [batch, heads, seq_len, dim] - accum_dtype = "float" + accum_dtype = T.float32 if window_size is not None: assert window_size % block_N == 0, "window_size must be divisible by block_N" @T.prim_func def flash_bwd( - Q: T.Tensor(shape, dtype), # type: ignore - K: T.Tensor(shape, dtype), # type: ignore - V: T.Tensor(shape, dtype), # type: ignore - dO: T.Tensor(shape, dtype), # type: ignore - lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore - Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore - dQ: T.Tensor(shape, accum_dtype), # type: ignore - dK: T.Tensor(shape, dtype), # type: ignore - dV: T.Tensor(shape, dtype), # type: ignore + Q: T.Tensor(shape, dtype), # type: ignore + K: T.Tensor(shape, dtype), # type: ignore + V: T.Tensor(shape, dtype), # type: ignore + dO: T.Tensor(shape, dtype), # type: ignore + lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + dQ: T.Tensor(shape, accum_dtype), # type: ignore + dK: T.Tensor(shape, dtype), # type: ignore + dV: T.Tensor(shape, dtype), # type: ignore ): with T.Kernel(heads, T.ceildiv(seq_len, block_M), batch, threads=threads) as (bx, by, bz): K_shared = T.alloc_shared([block_M, dim], dtype) @@ -252,43 +256,46 @@ def flashattn_bwd( dv_shared = T.alloc_shared([block_M, dim], dtype) dk_shared = T.alloc_shared([block_M, dim], dtype) - T.annotate_layout({ - dQ: make_dq_layout(dQ), - K_shared: tilelang.layout.make_swizzled_layout(K_shared), - dv_shared: tilelang.layout.make_swizzled_layout(dv_shared), - dk_shared: tilelang.layout.make_swizzled_layout(dk_shared), - }) - T.copy(K[bz, bx, by * block_M:(by + 1) * block_M, :], K_shared) - T.copy(V[bz, bx, by * block_M:(by + 1) * block_M, :], V_shared) + T.annotate_layout( + { + dQ: make_dq_layout(dQ), + K_shared: tilelang.layout.make_swizzled_layout(K_shared), + dv_shared: tilelang.layout.make_swizzled_layout(dv_shared), + dk_shared: tilelang.layout.make_swizzled_layout(dk_shared), + } + ) + T.copy(K[bz, bx, by * block_M : (by + 1) * block_M, :], K_shared) + T.copy(V[bz, bx, by * block_M : (by + 1) * block_M, :], V_shared) T.clear(dv) T.clear(dk) loop_st = T.floordiv(by * block_M, block_N) - loop_ed = T.min( - T.ceildiv((by + 1) * block_M + window_size, block_N), T.ceildiv( - seq_len, block_N)) if window_size is not None else T.ceildiv(seq_len, block_N) + loop_ed = ( + T.min(T.ceildiv((by + 1) * block_M + window_size, block_N), 
T.ceildiv(seq_len, block_N)) + if window_size is not None + else T.ceildiv(seq_len, block_N) + ) for k in T.Pipelined(loop_st, loop_ed, num_stages=num_stages): - T.copy(Q[bz, bx, k * block_N:(k + 1) * block_N, :], q) + T.copy(Q[bz, bx, k * block_N : (k + 1) * block_N, :], q) T.clear(qkT) T.gemm(K_shared, q, qkT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - T.copy(lse[bz, bx, k * block_N:(k + 1) * block_N], lse_shared) + T.copy(lse[bz, bx, k * block_N : (k + 1) * block_N], lse_shared) for i, j in T.Parallel(block_M, block_N): qkT[i, j] = T.exp2(qkT[i, j] * scale - lse_shared[j]) for i, j in T.Parallel(block_M, block_N): if window_size is not None: qkT[i, j] = T.if_then_else( - by * block_M + i <= k * block_N + j and - by * block_M + i > k * block_N + j - window_size, qkT[i, j], 0) + by * block_M + i <= k * block_N + j and by * block_M + i > k * block_N + j - window_size, qkT[i, j], 0 + ) else: - qkT[i, j] = T.if_then_else(by * block_M + i <= k * block_N + j, qkT[i, j], - 0) - T.copy(dO[bz, bx, k * block_N:(k + 1) * block_N, :], dst=do) + qkT[i, j] = T.if_then_else(by * block_M + i <= k * block_N + j, qkT[i, j], 0) + T.copy(dO[bz, bx, k * block_N : (k + 1) * block_N, :], dst=do) T.clear(dsT) T.gemm(V_shared, do, dsT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) T.copy(qkT, qkT_cast) T.gemm(qkT_cast, B=do, C=dv, policy=T.GemmWarpPolicy.FullRow) - T.copy(Delta[bz, bx, k * block_N:(k + 1) * block_N], delta) + T.copy(Delta[bz, bx, k * block_N : (k + 1) * block_N], delta) for i, j in T.Parallel(block_M, block_N): dsT_cast[i, j] = qkT[i, j] * (dsT[i, j] - delta[j]) * sm_scale @@ -297,27 +304,27 @@ def flashattn_bwd( T.copy(dsT_cast, dsT_shared) T.clear(dq) T.gemm(dsT_shared, K_shared, dq, transpose_A=True) - T.atomic_add(dQ[bz, bx, k * block_N:(k + 1) * block_N, :], dq) + T.atomic_add(dQ[bz, bx, k * block_N : (k + 1) * block_N, :], dq) T.copy(dv, dv_shared) T.copy(dk, dk_shared) - T.copy(dv_shared, dV[bz, bx, by * block_M:(by + 1) * block_M, :]) - T.copy(dk_shared, dK[bz, bx, by * block_M:(by + 1) * block_M, :]) + T.copy(dv_shared, dV[bz, bx, by * block_M : (by + 1) * block_M, :]) + T.copy(dk_shared, dK[bz, bx, by * block_M : (by + 1) * block_M, :]) return flash_bwd @tilelang.jit(out_idx=-1) -def flashattn_bwd_dsink(batch, heads, seq_len, block=128, dtype: str = "float16"): - accum_dtype = "float" +def flashattn_bwd_dsink(batch, heads, seq_len, block=128, dtype: T.dtype = T.float16): + accum_dtype = T.float32 shape = [batch, heads, seq_len] @T.prim_func def flash_bwd_dsink( - Sinks: T.Tensor([heads], dtype), # type: ignore - Delta: T.Tensor(shape, accum_dtype), # type: ignore - lse: T.Tensor(shape, accum_dtype), # type: ignore - dsinks: T.Tensor(shape, accum_dtype), # type: ignore + Sinks: T.Tensor([heads], dtype), # type: ignore + Delta: T.Tensor(shape, accum_dtype), # type: ignore + lse: T.Tensor(shape, accum_dtype), # type: ignore + dsinks: T.Tensor(shape, accum_dtype), # type: ignore ): with T.Kernel(heads, T.ceildiv(seq_len, block), batch, threads=128) as (bx, by, bz): sink = T.alloc_local([1], dtype) @@ -326,22 +333,20 @@ def flashattn_bwd_dsink(batch, heads, seq_len, block=128, dtype: str = "float16" dsink_fragment = T.alloc_fragment([block], accum_dtype) sink[0] = Sinks[bx] - T.copy(lse[bz, bx, by * block:(by + 1) * block], lse_fragment) - T.copy(Delta[bz, bx, by * block:(by + 1) * block], delta_fragment) + T.copy(lse[bz, bx, by * block : (by + 1) * block], lse_fragment) + T.copy(Delta[bz, bx, by * block : (by + 1) * block], delta_fragment) for i in T.Parallel(block): - 
dsink_fragment[i] = -T.exp2(Sinks[bx] * 1.44269504 - - lse_fragment[i]) * delta_fragment[i] - T.copy(dsink_fragment, dsinks[bz, bx, by * block:(by + 1) * block]) + dsink_fragment[i] = -T.exp2(Sinks[bx] * 1.44269504 - lse_fragment[i]) * delta_fragment[i] + T.copy(dsink_fragment, dsinks[bz, bx, by * block : (by + 1) * block]) return flash_bwd_dsink class _attention(torch.autograd.Function): - @staticmethod def forward(ctx, q, k, v, sinks, window_size): BATCH, H, N_CTX, D_HEAD = q.shape - dtype = "float16" if q.dtype == torch.float16 else "bfloat16" + dtype = T.float16 if q.dtype == torch.float16 else T.bfloat16 kernel = flashattn_fwd(BATCH, H, N_CTX, D_HEAD, window_size, dtype=dtype) o, lse = kernel(q, k, v, sinks) ctx.save_for_backward(q, k, v, sinks, o, lse) @@ -359,7 +364,7 @@ class _attention(torch.autograd.Function): return x do, q, k, v, sinks, o = [maybe_contiguous(x) for x in (do, q, k, v, sinks, o)] - dtype = "float16" if q.dtype == torch.float16 else "bfloat16" + dtype = T.float16 if q.dtype == torch.float16 else T.bfloat16 kernel_prep = flashattn_bwd_preprocess(BATCH, H, N_CTX, D_HEAD, dtype=dtype) kernel_post = flashattn_bwd_postprocess(BATCH, H, N_CTX, D_HEAD, dtype=dtype) delta = kernel_prep(o, do) @@ -381,15 +386,15 @@ attention = _attention.apply # Adapted and optimized from # https://github.com/openai/gpt-oss/blob/main/gpt_oss/triton/attention.py -def ref_program(query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - sinks: torch.Tensor, - sliding_window: Optional[int] = None, - dtype: torch.dtype = torch.float16) -> torch.Tensor: - - query = query.transpose(1, 2).contiguous().unsqueeze( - 3) # align with the original function's interface +def ref_program( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + sinks: torch.Tensor, + sliding_window: Optional[int] = None, + dtype: torch.dtype = torch.float16, +) -> torch.Tensor: + query = query.transpose(1, 2).contiguous().unsqueeze(3) # align with the original function's interface key = key.transpose(1, 2).contiguous() value = value.transpose(1, 2).contiguous() @@ -424,29 +429,23 @@ def ref_program(query: torch.Tensor, output = torch.einsum("bhmqk,bkhmd->bqhmd", scores, value.float()) - output = output.reshape(batch_size, num_queries, num_key_value_heads * num_key_value_groups, - head_dim).to(dtype) + output = output.reshape(batch_size, num_queries, num_key_value_heads * num_key_value_groups, head_dim).to(dtype) return output.transpose(1, 2).contiguous() -def main(BATCH: int = 1, - H: int = 1, - N_CTX: int = 512, - D_HEAD: int = 128, - window_size: Optional[int] = None, - dtype: str = "float16"): - torch_dtype = {"float16": torch.float16, "bfloat16": torch.bfloat16}[dtype] +def main(BATCH: int = 1, H: int = 1, N_CTX: int = 512, D_HEAD: int = 128, window_size: Optional[int] = None, dtype: T.dtype = T.float16): + dtype = T.dtype(dtype) + torch_dtype = dtype.as_torch() if window_size is not None: - print('Using sliding window attention.') + print("Using sliding window attention.") assert window_size <= N_CTX - flops_per_matmul = 2.0 * BATCH * H * min( - window_size, N_CTX // 2) * N_CTX * D_HEAD # just a rough estimation + flops_per_matmul = 2.0 * BATCH * H * min(window_size, N_CTX // 2) * N_CTX * D_HEAD # just a rough estimation else: - print('Using full attention.') + print("Using full attention.") flops_per_matmul = 2.0 * BATCH * H * N_CTX * N_CTX * D_HEAD * 0.5 total_flops = 5 * flops_per_matmul - Q = (torch.randn(BATCH, H, N_CTX, D_HEAD, dtype=torch_dtype, device="cuda").requires_grad_()) + Q = 
torch.randn(BATCH, H, N_CTX, D_HEAD, dtype=torch_dtype, device="cuda").requires_grad_() K = torch.randn_like(Q).requires_grad_() V = torch.randn_like(Q).requires_grad_() sinks = torch.randn(H, dtype=torch_dtype, device=Q.device).requires_grad_() @@ -468,19 +467,14 @@ def main(BATCH: int = 1, # Checks rtol, atol = { - "float16": (1e-2, 1e-2), - "bfloat16": (2e-2, 2e-2), + T.float16: (1e-2, 1e-2), + T.bfloat16: (2e-2, 2e-2), }[dtype] - assert torch.allclose(O, O_ref, rtol=rtol, atol=atol), f'O max err: {(O-O_ref).abs().max()}' - assert torch.allclose( - dV, dV_ref, rtol=rtol, atol=atol), f'dV max err: {(dV-dV_ref).abs().max()}' - assert torch.allclose( - dK, dK_ref, rtol=rtol, atol=atol), f'dK max err: {(dK-dK_ref).abs().max()}' - assert torch.allclose( - dQ, dQ_ref, rtol=rtol, atol=atol), f'dq max err: {(dQ-dQ_ref).abs().max()}' - assert torch.allclose( - dsinks, dsinks_ref, rtol=rtol, - atol=atol), f'dsinks max err: {(dsinks-dsinks_ref).abs().max()}' + assert torch.allclose(O, O_ref, rtol=rtol, atol=atol), f"O max err: {(O - O_ref).abs().max()}" + assert torch.allclose(dV, dV_ref, rtol=rtol, atol=atol), f"dV max err: {(dV - dV_ref).abs().max()}" + assert torch.allclose(dK, dK_ref, rtol=rtol, atol=atol), f"dK max err: {(dK - dK_ref).abs().max()}" + assert torch.allclose(dQ, dQ_ref, rtol=rtol, atol=atol), f"dq max err: {(dQ - dQ_ref).abs().max()}" + assert torch.allclose(dsinks, dsinks_ref, rtol=rtol, atol=atol), f"dsinks max err: {(dsinks - dsinks_ref).abs().max()}" print("All checks passed for tilelang kernels.✅") @@ -501,16 +495,11 @@ def main(BATCH: int = 1, if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=1, help='Batch size') - parser.add_argument('--h', type=int, default=64, help='Number of heads') - parser.add_argument('--n_ctx', type=int, default=4096, help='Context size') - parser.add_argument('--d_head', type=int, default=128, help='Head dimension') - parser.add_argument( - '--window_size', - type=int, - default=None, - help='window size (default: None, which means full attention)') - parser.add_argument( - '--dtype', type=str, default="float16", help="dtype, can be float16 or bfloat16") + parser.add_argument("--batch", type=int, default=1, help="Batch size") + parser.add_argument("--h", type=int, default=64, help="Number of heads") + parser.add_argument("--n_ctx", type=int, default=4096, help="Context size") + parser.add_argument("--d_head", type=int, default=128, help="Head dimension") + parser.add_argument("--window_size", type=int, default=None, help="window size (default: None, which means full attention)") + parser.add_argument("--dtype", type=str, default="float16", help="dtype, can be float16 or bfloat16") args = parser.parse_args() main(args.batch, args.h, args.n_ctx, args.d_head, args.window_size, args.dtype) diff --git a/examples/attention_sink/example_mha_sink_fwd_bhsd.py b/examples/attention_sink/example_mha_sink_fwd_bhsd.py index 2449b090cbe7153f559afe90bf50e6df594e4dd0..f6754bd94acf6fe9ca440cc9058ed7080b5ed267 100644 --- a/examples/attention_sink/example_mha_sink_fwd_bhsd.py +++ b/examples/attention_sink/example_mha_sink_fwd_bhsd.py @@ -18,31 +18,34 @@ def get_configs(): @autotune(configs=get_configs(), warmup=500, rep=100) @tilelang.jit( - out_idx=[3], pass_configs={ + out_idx=[3], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def flashattn( - batch, - heads, - seq_q, - seq_kv, - dim, - window_size=None, # None for full attention - sm_scale=None, - block_M=64, - 
block_N=64, - num_stages=1, - threads=128, - dtype: str = "float16"): + batch, + heads, + seq_q, + seq_kv, + dim, + window_size=None, # None for full attention + sm_scale=None, + block_M=64, + block_N=64, + num_stages=1, + threads=128, + dtype: T.dtype = T.float16, +): if window_size is not None: assert window_size % block_N == 0, "window_size must be divisible by block_N" if sm_scale is None: - sm_scale = (1.0 / dim)**0.5 + sm_scale = (1.0 / dim) ** 0.5 scale = sm_scale * 1.44269504 # log2(e) q_shape = [batch, heads, seq_q, dim] kv_shape = [batch, heads, seq_kv, dim] - accum_dtype = "float" + accum_dtype = T.float32 past_len = seq_kv - seq_q assert past_len >= 0, "seq_kv must be greater than or equal to seq_q" @@ -58,13 +61,12 @@ def flashattn( by: T.int32, bz: T.int32, ): - T.copy(K[bz, by, k * block_N:(k + 1) * block_N, :], K_shared) + T.copy(K[bz, by, k * block_N : (k + 1) * block_N, :], K_shared) for i, j in T.Parallel(block_M, block_N): q_idx = bx * block_M + i + past_len k_idx = k * block_N + j if window_size is not None: - acc_s[i, j] = T.if_then_else(q_idx >= k_idx and q_idx < k_idx + window_size, 0, - -T.infinity(acc_s.dtype)) + acc_s[i, j] = T.if_then_else(q_idx >= k_idx and q_idx < k_idx + window_size, 0, -T.infinity(acc_s.dtype)) else: acc_s[i, j] = T.if_then_else(q_idx >= k_idx, 0, -T.infinity(acc_s.dtype)) T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) @@ -79,29 +81,30 @@ def flashattn( by: T.int32, bz: T.int32, ): - T.copy(V[bz, by, k * block_N:(k + 1) * block_N, :], V_shared) + T.copy(V[bz, by, k * block_N : (k + 1) * block_N, :], V_shared) T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) @T.macro def Softmax( - acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype), - acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype), - scores_max: T.FragmentBuffer([block_M], accum_dtype), - scores_max_prev: T.FragmentBuffer([block_M], accum_dtype), - scores_scale: T.FragmentBuffer([block_M], accum_dtype), - scores_sum: T.FragmentBuffer([block_M], accum_dtype), - logsum: T.FragmentBuffer([block_M], accum_dtype), + acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype), + acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype), + scores_max: T.FragmentBuffer([block_M], accum_dtype), + scores_max_prev: T.FragmentBuffer([block_M], accum_dtype), + scores_scale: T.FragmentBuffer([block_M], accum_dtype), + scores_sum: T.FragmentBuffer([block_M], accum_dtype), + logsum: T.FragmentBuffer([block_M], accum_dtype), ): T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) # To do causal softmax, we need to set the scores_max to 0 if it is -inf # This process is called Check_inf in FlashAttention3 code, and it only need to be done # NOTE(wt): check_inf is necessary for sliding window attention. 
for i in T.Parallel(block_M): if window_size is not None: - scores_max[i] = T.if_then_else(scores_max[i] == -T.infinity(accum_dtype), 0, - scores_max[i]) + scores_max[i] = T.if_then_else(scores_max[i] == -T.infinity(accum_dtype), 0, scores_max[i]) scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_M, block_N): @@ -116,19 +119,19 @@ def flashattn( @T.macro def Rescale( - acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), - scores_scale: T.FragmentBuffer([block_M], accum_dtype), + acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), + scores_scale: T.FragmentBuffer([block_M], accum_dtype), ): for i, j in T.Parallel(block_M, dim): acc_o[i, j] *= scores_scale[i] @T.prim_func def main( - Q: T.Tensor(q_shape, dtype), - K: T.Tensor(kv_shape, dtype), - V: T.Tensor(kv_shape, dtype), - Output: T.Tensor(q_shape, dtype), - Sinks: T.Tensor([heads], dtype), + Q: T.Tensor(q_shape, dtype), + K: T.Tensor(kv_shape, dtype), + V: T.Tensor(kv_shape, dtype), + Output: T.Tensor(q_shape, dtype), + Sinks: T.Tensor([heads], dtype), ): with T.Kernel(T.ceildiv(seq_q, block_M), heads, batch, threads=threads) as (bx, by, bz): Q_shared = T.alloc_shared([block_M, dim], dtype) @@ -145,53 +148,51 @@ def flashattn( logsum = T.alloc_fragment([block_M], accum_dtype) sinks = T.alloc_fragment([block_M], dtype) - T.annotate_layout({ - Q_shared: make_swizzled_layout(Q_shared), - K_shared: make_swizzled_layout(K_shared), - V_shared: make_swizzled_layout(V_shared), - O_shared: make_swizzled_layout(O_shared), - }) + T.annotate_layout( + { + Q_shared: make_swizzled_layout(Q_shared), + K_shared: make_swizzled_layout(K_shared), + V_shared: make_swizzled_layout(V_shared), + O_shared: make_swizzled_layout(O_shared), + } + ) - T.copy(Q[bz, by, bx * block_M:(bx + 1) * block_M, :], Q_shared) + T.copy(Q[bz, by, bx * block_M : (bx + 1) * block_M, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) for i in T.Parallel(block_M): sinks[i] = Sinks[by] - end = T.min( - T.ceildiv(seq_kv, block_N), T.ceildiv((bx + 1) * block_M + past_len, block_N)) + end = T.min(T.ceildiv(seq_kv, block_N), T.ceildiv((bx + 1) * block_M + past_len, block_N)) - start = T.max(0, (bx * block_M + past_len - window_size) // - block_N) if window_size is not None else 0 + start = T.max(0, (bx * block_M + past_len - window_size) // block_N) if window_size is not None else 0 for k in T.Pipelined(start, end, num_stages=num_stages): MMA0(K, Q_shared, K_shared, acc_s, k, bx, by, bz) - Softmax(acc_s, acc_s_cast, scores_max, scores_max_prev, scores_scale, scores_sum, - logsum) + Softmax(acc_s, acc_s_cast, scores_max, scores_max_prev, scores_scale, scores_sum, logsum) Rescale(acc_o, scores_scale) MMA1(V, V_shared, acc_s_cast, acc_o, k, by, bz) for i in T.Parallel(block_M): - logsum[i] += T.exp2(sinks[i] * 1.44269504 - - scores_max[i] * scale) # The only change for attention sink + logsum[i] += T.exp2(sinks[i] * 1.44269504 - scores_max[i] * scale) # The only change for attention sink for i, j in T.Parallel(block_M, dim): acc_o[i, j] /= logsum[i] T.copy(acc_o, O_shared) - T.copy(O_shared, Output[bz, by, bx * block_M:(bx + 1) * block_M, :]) + T.copy(O_shared, Output[bz, by, bx * block_M : (bx + 1) * block_M, :]) return main # Modified from https://github.com/openai/gpt-oss/blob/main/gpt_oss/triton/attention.py -def ref_program(query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - sinks: torch.Tensor, - sliding_window: Optional[int] = None, - dtype: torch.dtype = 
torch.float16) -> torch.Tensor: - - query = query.transpose(1, 2).contiguous().unsqueeze( - 3) # align with the original function's interface +def ref_program( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + sinks: torch.Tensor, + sliding_window: Optional[int] = None, + dtype: torch.dtype = torch.float16, +) -> torch.Tensor: + query = query.transpose(1, 2).contiguous().unsqueeze(3) # align with the original function's interface key = key.transpose(1, 2).contiguous() value = value.transpose(1, 2).contiguous() @@ -226,41 +227,36 @@ def ref_program(query: torch.Tensor, output = torch.einsum("bhmqk,bkhmd->bqhmd", scores, value.float()) - output = output.reshape(batch_size, num_queries, num_key_value_heads * num_key_value_groups, - head_dim).to(dtype) + output = output.reshape(batch_size, num_queries, num_key_value_heads * num_key_value_groups, head_dim).to(dtype) return output.transpose(1, 2).contiguous() -def gen_inputs( - B, - H, - Sq, - Skv, - D, - dtype=torch.float16) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - query = torch.randn([B, H, Sq, D], dtype=dtype, device='cuda') - key = torch.randn([B, H, Skv, D], dtype=dtype, device='cuda') - value = torch.randn([B, H, Skv, D], dtype=dtype, device='cuda') - sinks = torch.randn([H], dtype=dtype, device='cuda') +def gen_inputs(B, H, Sq, Skv, D, dtype=torch.float16) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + query = torch.randn([B, H, Sq, D], dtype=dtype, device="cuda") + key = torch.randn([B, H, Skv, D], dtype=dtype, device="cuda") + value = torch.randn([B, H, Skv, D], dtype=dtype, device="cuda") + sinks = torch.randn([H], dtype=dtype, device="cuda") return query, key, value, sinks -def main(batch: int = 1, - heads: int = 1, - seq_q: int = 256, - seq_kv: int = 256, - dim: int = 128, - window_size: Optional[int] = None, - dtype: str = "float16", - tune: bool = False): - torch_dtype = {"float16": torch.float16, "bfloat16": torch.bfloat16}[dtype] +def main( + batch: int = 1, + heads: int = 1, + seq_q: int = 256, + seq_kv: int = 256, + dim: int = 128, + window_size: Optional[int] = None, + dtype: T.dtype = T.float16, + tune: bool = False, +): + dtype = T.dtype(dtype) + torch_dtype = dtype.as_torch() if window_size is not None: - print('Using sliding window attention.') + print("Using sliding window attention.") assert window_size <= seq_q - flops_per_matmul = 2.0 * batch * heads * min( - window_size, seq_kv // 2) * seq_q * dim # just a rough estimation + flops_per_matmul = 2.0 * batch * heads * min(window_size, seq_kv // 2) * seq_q * dim # just a rough estimation else: - print('Using full attention.') + print("Using full attention.") flops_per_matmul = 2.0 * batch * heads * seq_q * seq_kv * dim * 0.5 total_flops = 2 * flops_per_matmul @@ -287,19 +283,17 @@ def main(batch: int = 1, block_N=block_N, num_stages=num_stages, threads=threads, - dtype=dtype) + dtype=dtype, + ) Q, K, V, sinks = gen_inputs(batch, heads, seq_q, seq_kv, dim, dtype=torch_dtype) torch.testing.assert_close( - kernel(Q, K, V, sinks), - ref_program(Q, K, V, sinks, window_size, dtype=torch_dtype), - rtol=1e-2, - atol=1e-2) + kernel(Q, K, V, sinks), ref_program(Q, K, V, sinks, window_size, dtype=torch_dtype), rtol=1e-2, atol=1e-2 + ) print("All checks passed.✅") - latency = do_bench( - lambda: ref_program(Q, K, V, sinks, window_size, dtype=torch_dtype), warmup=500) + latency = do_bench(lambda: ref_program(Q, K, V, sinks, window_size, dtype=torch_dtype), warmup=500) print("Ref: {:.2f} ms".format(latency)) print("Ref: 
{:.2f} TFlops".format(total_flops / latency * 1e-9)) latency = do_bench(lambda: kernel(Q, K, V, sinks), warmup=500) @@ -309,19 +303,13 @@ def main(batch: int = 1, if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=8, help='batch size') - parser.add_argument('--heads', type=int, default=32, help='heads') - parser.add_argument('--seq_q', type=int, default=4096, help='sequence length of query') - parser.add_argument('--seq_kv', type=int, default=4096, help='sequence length of key/value') - parser.add_argument('--dim', type=int, default=128, help='dim') - parser.add_argument( - '--window_size', - type=int, - default=None, - help='window size (default: None, which means full attention)') - parser.add_argument( - '--dtype', type=str, default="float16", help="dtype, can be float16 or bfloat16") - parser.add_argument('--tune', action='store_true', help='tune') + parser.add_argument("--batch", type=int, default=8, help="batch size") + parser.add_argument("--heads", type=int, default=32, help="heads") + parser.add_argument("--seq_q", type=int, default=4096, help="sequence length of query") + parser.add_argument("--seq_kv", type=int, default=4096, help="sequence length of key/value") + parser.add_argument("--dim", type=int, default=128, help="dim") + parser.add_argument("--window_size", type=int, default=None, help="window size (default: None, which means full attention)") + parser.add_argument("--dtype", type=str, default=T.float16, help="dtype, can be float16 or bfloat16") + parser.add_argument("--tune", action="store_true", help="tune") args = parser.parse_args() - main(args.batch, args.heads, args.seq_q, args.seq_kv, args.dim, args.window_size, args.dtype, - args.tune) + main(args.batch, args.heads, args.seq_q, args.seq_kv, args.dim, args.window_size, args.dtype, args.tune) diff --git a/examples/attention_sink/example_mha_sink_fwd_bhsd_wgmma_pipelined.py b/examples/attention_sink/example_mha_sink_fwd_bhsd_wgmma_pipelined.py index 352844075a7d3d5f896c63d76ea1bbb10784bf40..ecaf2ce33941587ceeefc49627424f58adeca276 100644 --- a/examples/attention_sink/example_mha_sink_fwd_bhsd_wgmma_pipelined.py +++ b/examples/attention_sink/example_mha_sink_fwd_bhsd_wgmma_pipelined.py @@ -19,33 +19,35 @@ def get_configs(): @autotune(configs=get_configs(), warmup=500, rep=100) @tilelang.jit( - out_idx=[3], pass_configs={ + out_idx=[3], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def flashattn( - batch, - heads, - seq_q, - seq_kv, - dim, - window_size=None, # None for full attention - sm_scale=None, - block_M=128, - block_N=128, - num_stages=2, - threads=256, - dtype: str = "float16"): - + batch, + heads, + seq_q, + seq_kv, + dim, + window_size=None, # None for full attention + sm_scale=None, + block_M=128, + block_N=128, + num_stages=2, + threads=256, + dtype: T.dtype = T.float16, +): if window_size is not None: assert window_size % block_N == 0, "window_size must be divisible by block_N" if sm_scale is None: - sm_scale = (1.0 / dim)**0.5 + sm_scale = (1.0 / dim) ** 0.5 scale = sm_scale * 1.44269504 # log2(e) q_shape = [batch, heads, seq_q, dim] kv_shape = [batch, heads, seq_kv, dim] - accum_dtype = "float" + accum_dtype = T.float32 past_len = seq_kv - seq_q assert past_len >= 0, "seq_kv must be greater than or equal to seq_q" @@ -61,13 +63,12 @@ def flashattn( by: T.int32, bz: T.int32, ): - T.copy(K[bz, by, k * block_N:(k + 1) * block_N, :], K_shared) + T.copy(K[bz, by, k * block_N : (k + 1) * block_N, :], K_shared) for 
i, j in T.Parallel(block_M, block_N): q_idx = bx * block_M + i + past_len k_idx = k * block_N + j if window_size is not None: - acc_s[i, j] = T.if_then_else(q_idx >= k_idx and q_idx < k_idx + window_size, 0, - -T.infinity(acc_s.dtype)) + acc_s[i, j] = T.if_then_else(q_idx >= k_idx and q_idx < k_idx + window_size, 0, -T.infinity(acc_s.dtype)) else: acc_s[i, j] = T.if_then_else(q_idx >= k_idx, 0, -T.infinity(acc_s.dtype)) T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) @@ -82,29 +83,30 @@ def flashattn( by: T.int32, bz: T.int32, ): - T.copy(V[bz, by, k * block_N:(k + 1) * block_N, :], V_shared) + T.copy(V[bz, by, k * block_N : (k + 1) * block_N, :], V_shared) T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) @T.macro def Softmax( - acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype), - acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype), - scores_max: T.FragmentBuffer([block_M], accum_dtype), - scores_max_prev: T.FragmentBuffer([block_M], accum_dtype), - scores_scale: T.FragmentBuffer([block_M], accum_dtype), - scores_sum: T.FragmentBuffer([block_M], accum_dtype), - logsum: T.FragmentBuffer([block_M], accum_dtype), + acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype), + acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype), + scores_max: T.FragmentBuffer([block_M], accum_dtype), + scores_max_prev: T.FragmentBuffer([block_M], accum_dtype), + scores_scale: T.FragmentBuffer([block_M], accum_dtype), + scores_sum: T.FragmentBuffer([block_M], accum_dtype), + logsum: T.FragmentBuffer([block_M], accum_dtype), ): T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) # To do causal softmax, we need to set the scores_max to 0 if it is -inf # This process is called Check_inf in FlashAttention3 code, and it only need to be done # NOTE(wt): check_inf is necessary for sliding window attention. 
for i in T.Parallel(block_M): if window_size is not None: - scores_max[i] = T.if_then_else(scores_max[i] == -T.infinity(accum_dtype), 0, - scores_max[i]) + scores_max[i] = T.if_then_else(scores_max[i] == -T.infinity(accum_dtype), 0, scores_max[i]) scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_M, block_N): @@ -119,19 +121,19 @@ def flashattn( @T.macro def Rescale( - acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), - scores_scale: T.FragmentBuffer([block_M], accum_dtype), + acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), + scores_scale: T.FragmentBuffer([block_M], accum_dtype), ): for i, j in T.Parallel(block_M, dim): acc_o[i, j] *= scores_scale[i] @T.prim_func def main( - Q: T.Tensor(q_shape, dtype), - K: T.Tensor(kv_shape, dtype), - V: T.Tensor(kv_shape, dtype), - Output: T.Tensor(q_shape, dtype), - Sinks: T.Tensor([heads], dtype), + Q: T.Tensor(q_shape, dtype), + K: T.Tensor(kv_shape, dtype), + V: T.Tensor(kv_shape, dtype), + Output: T.Tensor(q_shape, dtype), + Sinks: T.Tensor([heads], dtype), ): with T.Kernel(T.ceildiv(seq_q, block_M), heads, batch, threads=threads) as (bx, by, bz): Q_shared = T.alloc_shared([block_M, dim], dtype) @@ -148,60 +150,59 @@ def flashattn( logsum = T.alloc_fragment([block_M], accum_dtype) sinks = T.alloc_fragment([block_M], dtype) - T.annotate_layout({ - Q_shared: make_swizzled_layout(Q_shared), - K_shared: make_swizzled_layout(K_shared), - V_shared: make_swizzled_layout(V_shared), - O_shared: make_swizzled_layout(O_shared), - }) + T.annotate_layout( + { + Q_shared: make_swizzled_layout(Q_shared), + K_shared: make_swizzled_layout(K_shared), + V_shared: make_swizzled_layout(V_shared), + O_shared: make_swizzled_layout(O_shared), + } + ) - T.copy(Q[bz, by, bx * block_M:(bx + 1) * block_M, :], Q_shared) + T.copy(Q[bz, by, bx * block_M : (bx + 1) * block_M, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) for i in T.Parallel(block_M): sinks[i] = Sinks[by] - end = T.min( - T.ceildiv(seq_kv, block_N), T.ceildiv((bx + 1) * block_M + past_len, block_N)) + end = T.min(T.ceildiv(seq_kv, block_N), T.ceildiv((bx + 1) * block_M + past_len, block_N)) - start = T.max(0, (bx * block_M + past_len - window_size) // - block_N) if window_size is not None else 0 + start = T.max(0, (bx * block_M + past_len - window_size) // block_N) if window_size is not None else 0 for k in T.Pipelined( - start, - end, - num_stages=num_stages, - order=[-1, 0, 3, 1, -1, 2], - stage=[-1, 0, 0, 1, -1, 1], - group=[[0], [1, 2], [3, 4, 5, 6, 7, 8, 9, 10], [11], [12], [13]]): + start, + end, + num_stages=num_stages, + order=[-1, 0, 3, 1, -1, 2], + stage=[-1, 0, 0, 1, -1, 1], + group=[[0], [1, 2], [3, 4, 5, 6, 7, 8, 9, 10, 11], [12], [13], [14]], + ): MMA0(K, Q_shared, K_shared, acc_s, k, bx, by, bz) - Softmax(acc_s, acc_s_cast, scores_max, scores_max_prev, scores_scale, scores_sum, - logsum) + Softmax(acc_s, acc_s_cast, scores_max, scores_max_prev, scores_scale, scores_sum, logsum) Rescale(acc_o, scores_scale) MMA1(V, V_shared, acc_s_cast, acc_o, k, by, bz) for i in T.Parallel(block_M): - logsum[i] += T.exp2(sinks[i] * 1.44269504 - - scores_max[i] * scale) # The only change for attention sink + logsum[i] += T.exp2(sinks[i] * 1.44269504 - scores_max[i] * scale) # The only change for attention sink for i, j in T.Parallel(block_M, dim): acc_o[i, j] /= logsum[i] T.copy(acc_o, O_shared) - T.copy(O_shared, Output[bz, by, bx * block_M:(bx + 1) * block_M, :]) + T.copy(O_shared, Output[bz, by, bx * 
block_M : (bx + 1) * block_M, :]) return main # Following functions are adapted and optimized from # https://github.com/openai/gpt-oss/blob/main/gpt_oss/triton/attention.py -def ref_program(query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - sinks: torch.Tensor, - sliding_window: Optional[int] = None, - dtype: torch.dtype = torch.float16) -> torch.Tensor: - - query = query.transpose(1, 2).contiguous().unsqueeze( - 3) # align with the original function'sinterface +def ref_program( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + sinks: torch.Tensor, + sliding_window: Optional[int] = None, + dtype: torch.dtype = torch.float16, +) -> torch.Tensor: + query = query.transpose(1, 2).contiguous().unsqueeze(3) # align with the original function's interface key = key.transpose(1, 2).contiguous() value = value.transpose(1, 2).contiguous() @@ -236,41 +237,36 @@ def ref_program(query: torch.Tensor, output = torch.einsum("bhmqk,bkhmd->bqhmd", scores, value.float()) - output = output.reshape(batch_size, num_queries, num_key_value_heads * num_key_value_groups, - head_dim).to(dtype) + output = output.reshape(batch_size, num_queries, num_key_value_heads * num_key_value_groups, head_dim).to(dtype) return output.transpose(1, 2).contiguous() -def gen_inputs( - B, - H, - Sq, - Skv, - D, - dtype=torch.float16) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - query = torch.randn([B, H, Sq, D], dtype=dtype, device='cuda') - key = torch.randn([B, H, Skv, D], dtype=dtype, device='cuda') - value = torch.randn([B, H, Skv, D], dtype=dtype, device='cuda') - sinks = torch.randn([H], dtype=dtype, device='cuda') +def gen_inputs(B, H, Sq, Skv, D, dtype=torch.float16) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + query = torch.randn([B, H, Sq, D], dtype=dtype, device="cuda") + key = torch.randn([B, H, Skv, D], dtype=dtype, device="cuda") + value = torch.randn([B, H, Skv, D], dtype=dtype, device="cuda") + sinks = torch.randn([H], dtype=dtype, device="cuda") return query, key, value, sinks -def main(batch: int = 1, - heads: int = 32, - seq_q: int = 256, - seq_kv: int = 256, - dim: int = 128, - window_size: Optional[int] = None, - dtype: str = "float16", - tune: bool = False): - torch_dtype = {"float16": torch.float16, "bfloat16": torch.bfloat16}[dtype] +def main( + batch: int = 1, + heads: int = 32, + seq_q: int = 256, + seq_kv: int = 256, + dim: int = 128, + window_size: Optional[int] = None, + dtype: T.dtype = T.float16, + tune: bool = False, +): + dtype = T.dtype(dtype) + torch_dtype = dtype.as_torch() if window_size is not None: - print('Using sliding window attention.') + print("Using sliding window attention.") assert window_size <= seq_q - flops_per_matmul = 2.0 * batch * heads * min( - window_size, seq_kv // 2) * seq_q * dim # just a rough estimation + flops_per_matmul = 2.0 * batch * heads * min(window_size, seq_kv // 2) * seq_q * dim # just a rough estimation else: - print('Using full attention.') + print("Using full attention.") flops_per_matmul = 2.0 * batch * heads * seq_q * seq_kv * dim * 0.5 total_flops = 2 * flops_per_matmul @@ -297,15 +293,14 @@ def main(batch: int = 1, block_N=block_N, num_stages=num_stages, threads=threads, - dtype=dtype) + dtype=dtype, + ) Q, K, V, sinks = gen_inputs(batch, heads, seq_q, seq_kv, dim, dtype=torch_dtype) torch.testing.assert_close( - kernel(Q, K, V, sinks), - ref_program(Q, K, V, sinks,
window_size, dtype=torch_dtype), rtol=1e-2, atol=1e-2 + ) print("All checks passed.✅") latency = do_bench(lambda: kernel(Q, K, V, sinks), warmup=500) @@ -315,19 +310,13 @@ def main(batch: int = 1, if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=8, help='batch size') - parser.add_argument('--heads', type=int, default=32, help='heads') - parser.add_argument('--seq_q', type=int, default=4096, help='sequence length of query') - parser.add_argument('--seq_kv', type=int, default=4096, help='sequence length of key/value') - parser.add_argument('--dim', type=int, default=128, help='dim') - parser.add_argument( - '--window_size', - type=int, - default=None, - help='window size (default: None, which means full attention)') - parser.add_argument( - '--dtype', type=str, default="float16", help="dtype, can be float16 or bfloat16") - parser.add_argument('--tune', action='store_true', help='tune') + parser.add_argument("--batch", type=int, default=8, help="batch size") + parser.add_argument("--heads", type=int, default=32, help="heads") + parser.add_argument("--seq_q", type=int, default=4096, help="sequence length of query") + parser.add_argument("--seq_kv", type=int, default=4096, help="sequence length of key/value") + parser.add_argument("--dim", type=int, default=128, help="dim") + parser.add_argument("--window_size", type=int, default=None, help="window size (default: None, which means full attention)") + parser.add_argument("--dtype", type=str, default=T.float16, help="dtype, can be float16 or bfloat16") + parser.add_argument("--tune", action="store_true", help="tune") args = parser.parse_args() - main(args.batch, args.heads, args.seq_q, args.seq_kv, args.dim, args.window_size, args.dtype, - args.tune) + main(args.batch, args.heads, args.seq_q, args.seq_kv, args.dim, args.window_size, args.dtype, args.tune) diff --git a/examples/bitnet-1.58b/benchmark_generate.py b/examples/bitnet-1.58b/benchmark_generate.py index d6f21ed502772af054ede58d8edce25a2f5879a0..d678b91a4e1c970e2209d2dfc0a102af4c3cf81b 100644 --- a/examples/bitnet-1.58b/benchmark_generate.py +++ b/examples/bitnet-1.58b/benchmark_generate.py @@ -12,8 +12,7 @@ bitblas.set_log_level("INFO") def generate_text_batch(model, tokenizer, prompts, max_length=100): # Encode the input prompts as a batch - input_ids = tokenizer( - prompts, return_tensors="pt", padding=True, truncation=True).input_ids.to(model.device) + input_ids = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True).input_ids.to(model.device) # Generate cos and sin values (commented out as not used in generation) seq_length = input_ids.size(1) @@ -37,9 +36,7 @@ def generate_text_batch(model, tokenizer, prompts, max_length=100): end_time = time.time() # Decode the output ids to text - generated_texts = [ - tokenizer.decode(output_id, skip_special_tokens=True) for output_id in output_ids - ] + generated_texts = [tokenizer.decode(output_id, skip_special_tokens=True) for output_id in output_ids] generation_time = end_time - start_time num_tokens = sum(len(output_id) for output_id in output_ids) @@ -52,8 +49,8 @@ def generate_text_batch(model, tokenizer, prompts, max_length=100): def profile(model, input_data): - import numpy as np + model = model.cuda() model.eval() @@ -74,25 +71,29 @@ def profile(model, input_data): return np.mean(times) -model_path = '1bitLLM/bitnet_b1_58-3B' +model_path = "1bitLLM/bitnet_b1_58-3B" def main(): parser = argparse.ArgumentParser() - parser.add_argument('--bs', default=16, 
type=int) - parser.add_argument('--in_seq_len', default=32, type=int) - parser.add_argument('--out_seq_len', default=128, type=int) - parser.add_argument('--bitblas', action='store_true') + parser.add_argument("--bs", default=16, type=int) + parser.add_argument("--in_seq_len", default=32, type=int) + parser.add_argument("--out_seq_len", default=128, type=int) + parser.add_argument("--bitblas", action="store_true") args = parser.parse_args() bs = args.bs in_seq_len = args.in_seq_len out_seq_len = args.out_seq_len is_bitblas = args.bitblas - model = BitnetForCausalLM.from_pretrained( - model_path, - use_flash_attention_2=True, - torch_dtype=torch.float16, - ).cuda().half() + model = ( + BitnetForCausalLM.from_pretrained( + model_path, + use_flash_attention_2=True, + torch_dtype=torch.float16, + ) + .cuda() + .half() + ) if is_bitblas: with torch.no_grad(): model.quantize() @@ -109,5 +110,5 @@ def main(): print(generate_text_batch(model, tokenizer, prompts, max_length=max_length)) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/examples/bitnet-1.58b/benchmark_inference_latency.py b/examples/bitnet-1.58b/benchmark_inference_latency.py index 9ce7a3898cba0cb6bdcf09677da2a6123f075a2d..788fc5565d5d58b59ef11a11b33f357e911ba9bc 100644 --- a/examples/bitnet-1.58b/benchmark_inference_latency.py +++ b/examples/bitnet-1.58b/benchmark_inference_latency.py @@ -6,13 +6,14 @@ from modeling_bitnet import BitnetForCausalLM torch.set_grad_enabled(False) parser = argparse.ArgumentParser() -parser.add_argument('--hf_path', default='1bitLLM/bitnet_b1_58-3B', type=str) +parser.add_argument("--hf_path", default="1bitLLM/bitnet_b1_58-3B", type=str) def profile(model, input_data): import time import numpy as np + model = model.cuda() model.eval() @@ -35,8 +36,8 @@ def profile(model, input_data): def main(): model = BitnetForCausalLM.from_pretrained( - '1bitLLM/bitnet_b1_58-3B', - device_map='auto', + "1bitLLM/bitnet_b1_58-3B", + device_map="auto", low_cpu_mem_usage=True, use_flash_attention_2=True, torch_dtype=torch.float16, @@ -52,5 +53,5 @@ def main(): print(f"Batch size: {batch_size}, Seq len: {seq_len}, Latency: {latency}") -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/examples/bitnet-1.58b/configuration_bitnet.py b/examples/bitnet-1.58b/configuration_bitnet.py index 5f4937b87bf483e8c75fd4f7ba8725a845faa512..63c499db36d96d50f567794bf80a60882e08114f 100644 --- a/examples/bitnet-1.58b/configuration_bitnet.py +++ b/examples/bitnet-1.58b/configuration_bitnet.py @@ -17,7 +17,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" LLaMA model configuration""" +"""LLaMA model configuration""" from transformers.configuration_utils import PretrainedConfig from transformers.utils import logging @@ -180,16 +180,10 @@ class BitnetConfig(PretrainedConfig): return if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2: - raise ValueError( - "`rope_scaling` must be a dictionary with with two fields, `type` and `factor`, " - f"got {self.rope_scaling}") + raise ValueError(f"`rope_scaling` must be a dictionary with two fields, `type` and `factor`, got {self.rope_scaling}") rope_scaling_type = self.rope_scaling.get("type", None) rope_scaling_factor = self.rope_scaling.get("factor", None) if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]: - raise ValueError( - f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}" - ) - if rope_scaling_factor is None or not isinstance(rope_scaling_factor, - float) or rope_scaling_factor <= 1.0: - raise ValueError( - f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}") + raise ValueError(f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}") + if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0: + raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}") diff --git a/examples/bitnet-1.58b/eval_correctness.py b/examples/bitnet-1.58b/eval_correctness.py index ac1e340729fc46b0bed8957bb26aedd20d5af2c9..11d47004b81edf517d442cb0eb2b70e6c583cce0 100644 --- a/examples/bitnet-1.58b/eval_correctness.py +++ b/examples/bitnet-1.58b/eval_correctness.py @@ -47,8 +47,8 @@ def generate_text(model, tokenizer, prompt, max_length=100): def profile(model, input_data): - import numpy as np + model = model.cuda() model.eval() @@ -69,18 +69,22 @@ def profile(model, input_data): return np.mean(times) -model_path = '1bitLLM/bitnet_b1_58-3B' +model_path = "1bitLLM/bitnet_b1_58-3B" def main(): - model = BitnetForCausalLM.from_pretrained( - model_path, - use_flash_attention_2=False, - torch_dtype=torch.float16, - ).cuda().half() + model = ( + BitnetForCausalLM.from_pretrained( + model_path, + use_flash_attention_2=False, + torch_dtype=torch.float16, + ) + .cuda() + .half() + ) tokenizer = BitnetTokenizer.from_pretrained(model_path, use_fast=False) - input_id = tokenizer("Hello")['input_ids'] + input_id = tokenizer("Hello")["input_ids"] input_id = torch.tensor(input_id).unsqueeze(0).cuda() print("original model generated text:") @@ -91,5 +95,5 @@ def main(): print(generate_text(model, tokenizer, "Hello", max_length=100)) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/examples/bitnet-1.58b/eval_gpu_memory.py b/examples/bitnet-1.58b/eval_gpu_memory.py index 597cbbfcdaaaa1206ceacae45b235b6709f7f61e..00c914cb31c919fc536d0705f59cacf29a30e287 100644 --- a/examples/bitnet-1.58b/eval_gpu_memory.py +++ b/examples/bitnet-1.58b/eval_gpu_memory.py @@ -6,13 +6,14 @@ from modeling_bitnet import BitnetForCausalLM torch.set_grad_enabled(False) parser = argparse.ArgumentParser() -parser.add_argument('--hf_path', default='1bitLLM/bitnet_b1_58-3B', type=str) +parser.add_argument("--hf_path", default="1bitLLM/bitnet_b1_58-3B", type=str) def profile(model, input_data): import time import numpy as np + model = model.cuda() model.eval() @@ -35,17 +36,17 @@ def profile(model, input_data): def main(): model = BitnetForCausalLM.from_pretrained( -
'1bitLLM/bitnet_b1_58-3B', - device_map='auto', + "1bitLLM/bitnet_b1_58-3B", + device_map="auto", low_cpu_mem_usage=True, use_flash_attention_2=True, torch_dtype=torch.float16, ).half() - print(f"gpu memory: {torch.cuda.memory_allocated() / 1024 ** 3} GB") + print(f"gpu memory: {torch.cuda.memory_allocated() / 1024**3} GB") with torch.no_grad(): model._post_process_weights() - print(f"gpu memory BitBLAS: {torch.cuda.memory_allocated() / 1024 ** 3} GB") + print(f"gpu memory BitBLAS: {torch.cuda.memory_allocated() / 1024**3} GB") -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/examples/bitnet-1.58b/eval_ppl.py b/examples/bitnet-1.58b/eval_ppl.py index 61c8488e46707b5e05f3982c8046618ded6b4098..97db2d0f5236f369a33f70ac1b07fe9a8c01df9d 100644 --- a/examples/bitnet-1.58b/eval_ppl.py +++ b/examples/bitnet-1.58b/eval_ppl.py @@ -15,9 +15,9 @@ from tqdm import tqdm torch.set_grad_enabled(False) parser = argparse.ArgumentParser() -parser.add_argument('--seed', default=0, type=int) -parser.add_argument('--hf_path', default='1bitLLM/bitnet_b1_58-3B', type=str) -parser.add_argument('--seqlen', default=2048, type=int) +parser.add_argument("--seed", default=0, type=int) +parser.add_argument("--hf_path", default="1bitLLM/bitnet_b1_58-3B", type=str) +parser.add_argument("--seqlen", default=2048, type=int) def calulate_loss(model, input, loss_fct): @@ -29,12 +29,16 @@ def calulate_loss(model, input, loss_fct): def main(args): - datasets = ['c4', 'wikitext2'] - model = BitnetForCausalLM.from_pretrained( - args.hf_path, - use_flash_attention_2=True, - torch_dtype=torch.float16, - ).cuda().half() + datasets = ["c4", "wikitext2"] + model = ( + BitnetForCausalLM.from_pretrained( + args.hf_path, + use_flash_attention_2=True, + torch_dtype=torch.float16, + ) + .cuda() + .half() + ) with torch.no_grad(): model._post_process_weights() tokenizer = BitnetTokenizer.from_pretrained(args.hf_path, use_fast=False) @@ -48,9 +52,9 @@ def main(args): for ii in progress: input = torch.Tensor(testdata[ii]).long().cuda().view(1, -1) loss = calulate_loss(model, input, loss_fct) - count += (input.size(-1) - 1) + count += input.size(-1) - 1 acc_loss += loss.item() - progress.set_description(f"avg_loss = {acc_loss/ count / math.log(2)}") + progress.set_description(f"avg_loss = {acc_loss / count / math.log(2)}") avg_loss = acc_loss / count / math.log(2) ppl.append(2**avg_loss) @@ -60,7 +64,7 @@ def main(args): print("Avg PPL:", sum(ppl) / len(ppl)) -if __name__ == '__main__': +if __name__ == "__main__": torch.set_grad_enabled(False) args = parser.parse_args() random.seed(args.seed) diff --git a/examples/bitnet-1.58b/eval_utils.py b/examples/bitnet-1.58b/eval_utils.py index 46241eedf0a4224c7aae8ceb4a6549350d513fc2..72480c392a7cfa40081546d2da19aa31463aab76 100644 --- a/examples/bitnet-1.58b/eval_utils.py +++ b/examples/bitnet-1.58b/eval_utils.py @@ -15,21 +15,17 @@ def set_seed(seed): def get_test_dataset(dataset_name, tokenizer, seqlen=2048): if dataset_name == "wikitext2": - testdata = load_dataset('wikitext', 'wikitext-2-raw-v1', split='test') - testdata = "".join(testdata['text']).split('\n') + testdata = load_dataset("wikitext", "wikitext-2-raw-v1", split="test") + testdata = "".join(testdata["text"]).split("\n") elif dataset_name == "c4": - testdata = load_dataset( - 'allenai/c4', - data_files={'validation': 'en/c4-validation.00000-of-00008.json.gz'}, - split='validation')['text'] + testdata = load_dataset("allenai/c4", data_files={"validation": "en/c4-validation.00000-of-00008.json.gz"}, 
split="validation")[ + "text" + ] else: raise NotImplementedError testdata = [item for item in testdata if item != ""] - tokenized_text = [ - tokenizer(item, add_special_tokens=False)['input_ids'] + [tokenizer.eos_token_id] - for item in testdata - ] + tokenized_text = [tokenizer(item, add_special_tokens=False)["input_ids"] + [tokenizer.eos_token_id] for item in testdata] data, doc = [], [tokenizer.bos_token_id] for sen in tokenized_text: @@ -45,7 +41,6 @@ def get_test_dataset(dataset_name, tokenizer, seqlen=2048): class LMEvalAdaptor(BaseLM): - def __init__(self, model_name, model, tokenizer, batch_size=1, max_length=-1): super().__init__() @@ -137,5 +132,4 @@ class LMEvalAdaptor(BaseLM): return out def _model_generate(self, context, max_length, eos_token_id): - return self.model.generate( - context, max_length=max_length, eos_token_id=eos_token_id, do_sample=False) + return self.model.generate(context, max_length=max_length, eos_token_id=eos_token_id, do_sample=False) diff --git a/examples/bitnet-1.58b/kernel_benchmark/tilelang_bitnet_158_int8xint2_decode.py b/examples/bitnet-1.58b/kernel_benchmark/tilelang_bitnet_158_int8xint2_decode.py index e5af16cc486a107c635ea1d7d88477cd479a4cb9..7b8b7b95cdb24f1bba466a8e776796b7ab025315 100644 --- a/examples/bitnet-1.58b/kernel_benchmark/tilelang_bitnet_158_int8xint2_decode.py +++ b/examples/bitnet-1.58b/kernel_benchmark/tilelang_bitnet_158_int8xint2_decode.py @@ -76,13 +76,13 @@ def bitnet_158_int8xint2_decode( reduce_thread=32, ): assert in_dtype in [ - "float16", - "int8", + T.float16, + T.int8, ], "Currently only float16 and int8 are supported" assert out_dtype in [ - "float16", - "float32", - "int32", + T.float16, + T.float32, + T.int32, ], "Currently only float16, float32 and int32 are supported" storage_nbit = 8 num_bits = 2 @@ -94,7 +94,7 @@ def bitnet_158_int8xint2_decode( MAX_TRANSACTION_SIZE_IN_BITS = 128 micro_size_k = MAX_TRANSACTION_SIZE_IN_BITS // DataType(in_dtype).bits micro_size_k_compressed = micro_size_k // num_elems_per_byte - storage_dtype = "int8" + storage_dtype = T.int8 block_K = reduce_thread * micro_size_k use_dp4a = True @@ -102,17 +102,17 @@ def bitnet_158_int8xint2_decode( @T.prim_func def kernel( - A: T.Buffer(A_shape, in_dtype), - B: T.Buffer(B_shape, storage_dtype), - C: T.Buffer(C_shape, out_dtype), + A: T.Buffer(A_shape, in_dtype), + B: T.Buffer(B_shape, storage_dtype), + C: T.Buffer(C_shape, out_dtype), ): with T.Kernel( - T.ceildiv(N, n_partition), - M, - threads=(reduce_thread, n_partition), + T.ceildiv(N, n_partition), + M, + threads=(reduce_thread, n_partition), ) as ( - bx, - by, + bx, + by, ): A_local = T.alloc_local((micro_size_k,), in_dtype) B_quant_local = T.alloc_local([micro_size_k_compressed], storage_dtype) @@ -133,8 +133,7 @@ def bitnet_158_int8xint2_decode( for v in T.vectorized(micro_size_k_compressed): B_quant_local[v] = B[ bx * n_partition + ni, - ko * (reduce_thread * micro_size_k_compressed) + - kr * micro_size_k_compressed + v, + ko * (reduce_thread * micro_size_k_compressed) + kr * micro_size_k_compressed + v, ] T.call_extern( @@ -156,9 +155,9 @@ def bitnet_158_int8xint2_decode( accum_res[0] += A_local[ki] * B_dequantize_local[ki] with T.attr( - T.comm_reducer(lambda x, y: x + y, [T.Cast(accum_dtype, 0)]), - "reduce_scope", - T.reinterpret(T.uint64(0), dtype="handle"), + T.comm_reducer(lambda x, y: x + y, [T.Cast(accum_dtype, 0)]), + "reduce_scope", + T.reinterpret(T.uint64(0), dtype="handle"), ): T.evaluate( T.tvm_thread_allreduce( @@ -168,7 +167,8 @@ def bitnet_158_int8xint2_decode( 
reduced_accum_res[0], kr, dtype="handle", - )) + ) + ) if kr == 0: C[by, bx * n_partition + ni] = reduced_accum_res[0] @@ -194,12 +194,12 @@ def general_compress(lowprecision_weight, source_bits=4, storage_dtype=np.int8): # interleave weight numpy implementation -def interleave_weight(qweight, nbits=4, target_dtype="float16"): - assert target_dtype in ["float16", "int8"] +def interleave_weight(qweight, nbits=4, target_dtype=T.float16): + assert target_dtype in [T.float16, T.int8] # reinterpret the data type of qweight to int32 qweight = qweight.view(np.int32) new_qweight = np.zeros_like(qweight) - bits_stride = 8 if target_dtype == "int8" else 16 + bits_stride = 8 if target_dtype == T.int8 else 16 mask = (1 << nbits) - 1 # for 4bit the val is 0x0000000f num_groups = 32 // bits_stride elems_per_group = bits_stride // nbits @@ -209,7 +209,7 @@ def interleave_weight(qweight, nbits=4, target_dtype="float16"): shift = (offset % num_groups) * bits_stride + (offset // num_groups) * nbits new_qweight |= ((qweight >> (nbits * offset)) & mask) << shift - if nbits == 1 and target_dtype == "int8": + if nbits == 1 and target_dtype == T.int8: # special handling for 1b interleave n16_weight = new_qweight & np.int32(0xF0F00F0F) n16_weight |= ((new_qweight & np.int32(0x000000F0)) >> 4) << 16 @@ -217,12 +217,12 @@ def interleave_weight(qweight, nbits=4, target_dtype="float16"): n16_weight |= ((new_qweight & np.int32(0x000F0000)) >> 16) << 4 n16_weight |= ((new_qweight & np.int32(0x0F000000)) >> 24) << 12 return n16_weight.view(np.int8) - elif nbits == 2 and target_dtype == "float16": + elif nbits == 2 and target_dtype == T.float16: n8_weight = new_qweight & np.int32(0xFF0000FF) n8_weight |= ((new_qweight & np.int32(0x0000FF00)) >> 8) << 16 n8_weight |= ((new_qweight & np.int32(0x00FF0000)) >> 16) << 8 return n8_weight.view(np.int8) - elif nbits == 1 and target_dtype == "float16": + elif nbits == 1 and target_dtype == T.float16: n8_weight = new_qweight & 0xF000000F n8_weight |= ((new_qweight & 0x000000F0) >> 4) << 8 n8_weight |= ((new_qweight & 0x00000F00) >> 8) << 16 @@ -234,13 +234,7 @@ def interleave_weight(qweight, nbits=4, target_dtype="float16"): return new_qweight.view(np.int8) -def assert_bitnet_158_int8xint2_decode_correctness(M, - N, - K, - in_dtype, - out_dtype, - accum_dtype, - fast_decoding=True): +def assert_bitnet_158_int8xint2_decode_correctness(M, N, K, in_dtype, out_dtype, accum_dtype, fast_decoding=True): program = bitnet_158_int8xint2_decode(M, N, K, in_dtype, out_dtype, accum_dtype, fast_decoding) print(program) kernel = tilelang.compile(program) @@ -265,4 +259,4 @@ def assert_bitnet_158_int8xint2_decode_correctness(M, if __name__ == "__main__": - assert_bitnet_158_int8xint2_decode_correctness(1, 256, 256, "int8", "int32", "int32") + assert_bitnet_158_int8xint2_decode_correctness(1, 256, 256, T.int8, T.int32, T.int32) diff --git a/examples/bitnet-1.58b/kernel_benchmark/tilelang_bitnet_158_int8xint2_prefill.py b/examples/bitnet-1.58b/kernel_benchmark/tilelang_bitnet_158_int8xint2_prefill.py index d8b1f6228e7d0a386e71db3d4482dfe3e5290873..8c337398233f32905bd4dd929490287d38660126 100644 --- a/examples/bitnet-1.58b/kernel_benchmark/tilelang_bitnet_158_int8xint2_prefill.py +++ b/examples/bitnet-1.58b/kernel_benchmark/tilelang_bitnet_158_int8xint2_prefill.py @@ -8,11 +8,13 @@ import tilelang.language as T from tilelang import tvm as tvm from tvm import DataType from tilelang.intrinsics.mma_layout import ( - make_mma_swizzle_layout as make_swizzle_layout,) + make_mma_swizzle_layout as 
make_swizzle_layout, +) import numpy as np from tilelang.intrinsics.mma_macro_generator import ( - INT4TensorCoreIntrinEmitter,) + INT4TensorCoreIntrinEmitter, +) from tilelang.transform import simplify_prim_func torch.manual_seed(42) @@ -86,9 +88,9 @@ def bitnet_158_int8xint2_prefill( Create a TVM GPU prim_func implementing a block-tiled matrix multiply that multiplies dense A by compressed/interleaved low‑precision B (2-bit packed into int8 storage), decoding B to int8 on-chip and accumulating into C. The returned prim_func expects: - - A: shape (M, K) with dtype `in_dtype` ("float16" or "int8"). + - A: shape (M, K) with dtype `in_dtype` (T.float16 or T.int8). - B: compressed storage with shape (N, K/4) and int8 storage layout (packing 4 2-bit elements per byte). - - C: output buffer shape (M, N) with dtype `out_dtype` ("float16", "float32", or "int32"). + - C: output buffer shape (M, N) with dtype `out_dtype` (T.float16, T.float32, or T.int32). Details: - Builds a tiled, pipelined kernel using shared memory and warp-level MMA intrinsics (INT4TensorCoreIntrinEmitter). B is loaded from compressed storage, decoded to int8 in threads (via decode_i2u_to_i8s / decode_i2s_to_i8s), and dequantized into a shared buffer used by the MMA emitter. @@ -96,15 +98,15 @@ def bitnet_158_int8xint2_prefill( - block_row_warps, block_col_warps: number of warps per block in row/col. - warp_row_tiles, warp_col_tiles: tiles per warp. - chunk: K-sized chunk per block (block_K). - - micro sizes are fixed (16x16x16, except micro_k=32 when accum_dtype == "int32"). + - micro sizes are fixed (16x16x16, except micro_k=32 when accum_dtype == T.int32). - Uses 2-stage pipelining by default to overlap loads and compute and applies a swizzle layout to improve L2 behavior. - Assertions: raises AssertionError if in_dtype or out_dtype are not among supported values. Parameters: M, N, K (int): Global matrix dimensions. - in_dtype (str): Input and decoded B element dtype; "float16" or "int8". - out_dtype (str): Output C dtype; one of "float16", "float32", "int32". - accum_dtype (str): Accumulator dtype used by MMA (e.g., "int32"). + in_dtype (str): Input and decoded B element dtype; T.float16 or T.int8. + out_dtype (str): Output C dtype; one of T.float16, T.float32, T.int32. + accum_dtype (str): Accumulator dtype used by MMA (e.g., T.int32). fast_decoding (bool): If True, enable the fast decoding path (affects which device decode is used). block_row_warps (int): Warps in block row dimension. block_col_warps (int): Warps in block column dimension. @@ -116,18 +118,18 @@ def bitnet_158_int8xint2_prefill( T.prim_func: A TVM prim_func implementing the described GPU kernel suitable for compilation and execution. 
""" assert in_dtype in [ - "float16", - "int8", + T.float16, + T.int8, ], "Currently only float16 and int8 are supported" assert out_dtype in [ - "float16", - "float32", - "int32", + T.float16, + T.float32, + T.int32, ], "Currently only float16, float32 and int32 are supported" micro_size_x = micro_size_y = micro_size_k = 16 - if accum_dtype == "int32": + if accum_dtype == T.int32: micro_size_k = 32 num_elems_per_byte = 4 @@ -136,7 +138,7 @@ def bitnet_158_int8xint2_prefill( local_size_compressed = local_size // num_elems_per_byte shared_scope = "shared.dyn" - storage_dtype = "int8" + storage_dtype = T.int8 # Pipeline Stage stage = 2 @@ -181,38 +183,36 @@ def bitnet_158_int8xint2_prefill( @T.prim_func def main( - A: T.Buffer(A_shape, in_dtype), - B: T.Buffer(B_shape, storage_dtype), - C: T.Buffer((M, N), out_dtype), + A: T.Buffer(A_shape, in_dtype), + B: T.Buffer(B_shape, storage_dtype), + C: T.Buffer((M, N), out_dtype), ): """ - GPU kernel entry that performs a blocked, pipelined matrix multiplication A @ B.T writing into C. + GPU kernel entry that performs a blocked, pipelined matrix multiplication A @ B.T writing into C. - This kernel: - - Loads tiles of A and a compressed/interleaved representation of B from global memory into shared memory. - - Decodes B's packed low-precision format (storage_dtype, e.g., 2-bit packed) into element values of `in_dtype` in shared memory via an external decode routine. - - Uses Warp/MMA tiled fragments and an INT4/INT2-capable MMA emitter to compute accumulation across K in a pipelined fashion with configurable stages. - - Writes accumulated tile results from shared memory back to global C with the expected block/micro-tile indexing. + This kernel: + - Loads tiles of A and a compressed/interleaved representation of B from global memory into shared memory. + - Decodes B's packed low-precision format (storage_dtype, e.g., 2-bit packed) into element values of `in_dtype` in shared memory via an external decode routine. + - Uses Warp/MMA tiled fragments and an INT4/INT2-capable MMA emitter to compute accumulation across K in a pipelined fashion with configurable stages. + - Writes accumulated tile results from shared memory back to global C with the expected block/micro-tile indexing. - Parameters: - A: Input matrix buffer of shape A_shape and element type `in_dtype`. Represents the MxK activations. - B: Compressed/interleaved weight buffer of shape B_shape and storage type `storage_dtype`. Must contain B in the packed low-precision layout expected by the decode routine used by this kernel. - C: Output buffer of shape (M, N) and type `out_dtype`; receives the resulting matrix (accumulated values are produced in `accum_dtype` and stored into C). + Parameters: + A: Input matrix buffer of shape A_shape and element type `in_dtype`. Represents the MxK activations. + B: Compressed/interleaved weight buffer of shape B_shape and storage type `storage_dtype`. Must contain B in the packed low-precision layout expected by the decode routine used by this kernel. + C: Output buffer of shape (M, N) and type `out_dtype`; receives the resulting matrix (accumulated values are produced in `accum_dtype` and stored into C). - Side effects: - Writes results into C. Calls external device decode functions to expand B from its packed representation into shared memory before computation. + Side effects: + Writes results into C. Calls external device decode functions to expand B from its packed representation into shared memory before computation. 
""" with T.Kernel( - T.ceildiv(N, block_N), - T.ceildiv(M, block_M), - threads=threads, - prelude=decode_i2s_to_i8s, + T.ceildiv(N, block_N), + T.ceildiv(M, block_M), + threads=threads, + prelude=decode_i2s_to_i8s, ) as (bx, by): - A_shared = T.alloc_shared(A_shared_shape, in_dtype, scope=shared_scope) B_shared = T.alloc_shared(B_shared_shape, storage_dtype, scope=shared_scope) - B_dequantize_shared = T.alloc_shared( - B_dequantize_shared_shape, in_dtype, scope=shared_scope) + B_dequantize_shared = T.alloc_shared(B_dequantize_shared_shape, in_dtype, scope=shared_scope) C_shared = T.alloc_shared(C_shared_shape, out_dtype, scope=shared_scope) A_frag = T.alloc_local((warp_rows * fragement_size_a), in_dtype) B_frag = T.alloc_local((warp_cols * fragement_size_b), in_dtype) @@ -223,10 +223,12 @@ def bitnet_158_int8xint2_prefill( thread_bindings = T.thread_binding(0, threads, "threadIdx.x") - T.annotate_layout({ - A_shared: make_swizzle_layout(A_shared), - B_dequantize_shared: make_swizzle_layout(B_dequantize_shared), - }) + T.annotate_layout( + { + A_shared: make_swizzle_layout(A_shared), + B_dequantize_shared: make_swizzle_layout(B_dequantize_shared), + } + ) # Improve L2 Cache T.use_swizzle(panel_size=10) @@ -234,7 +236,6 @@ def bitnet_158_int8xint2_prefill( T.clear(C_frag) for ko in T.Pipelined((K // block_K), num_stages=stage): - # Load A into shared memory for i, k in T.Parallel(block_M, block_K): A_shared[i, k] = A[by * block_M + i, ko * block_K + k] @@ -243,12 +244,9 @@ def bitnet_158_int8xint2_prefill( for j, k in T.Parallel(block_N, block_K // num_elems_per_byte): B_shared[j, k] = B[bx * block_N + j, ko * (block_K // num_elems_per_byte) + k] - for i in T.serial(block_N * block_K // num_elems_per_byte // - (threads * local_size_compressed)): + for i in T.serial(block_N * block_K // num_elems_per_byte // (threads * local_size_compressed)): for v in T.vectorized(0, local_size_compressed): - index = ( - i * threads * local_size_compressed + - thread_bindings * local_size_compressed + v) + index = i * threads * local_size_compressed + thread_bindings * local_size_compressed + v vi, vj = T.index_to_coordinates(index, B_shared_shape) B_local[v] = B_shared[vi, vj] @@ -260,12 +258,11 @@ def bitnet_158_int8xint2_prefill( ) for v in T.vectorized(0, local_size): - index = (i * threads * local_size + thread_bindings * local_size + v) + index = i * threads * local_size + thread_bindings * local_size + v vi, vj = T.index_to_coordinates(index, B_dequantize_shared_shape) B_dequantize_shared[vi, vj] = B_dequantize_local[v] for ki in T.serial(0, (block_K // micro_size_k)): - # Load A into fragment mma_emitter.ldmatrix_a( A_frag, @@ -320,12 +317,12 @@ def general_compress(lowprecision_weight, source_bits=4, storage_dtype=np.int8): # interleave weight numpy implementation -def interleave_weight(qweight, nbits=4, target_dtype="float16"): - assert target_dtype in ["float16", "int8"] +def interleave_weight(qweight, nbits=4, target_dtype=T.float16): + assert target_dtype in [T.float16, T.int8] # reinterpret the data type of qweight to int32 qweight = qweight.view(np.int32) new_qweight = np.zeros_like(qweight) - bits_stride = 8 if target_dtype == "int8" else 16 + bits_stride = 8 if target_dtype == T.int8 else 16 mask = (1 << nbits) - 1 # for 4bit the val is 0x0000000f num_groups = 32 // bits_stride elems_per_group = bits_stride // nbits @@ -335,7 +332,7 @@ def interleave_weight(qweight, nbits=4, target_dtype="float16"): shift = (offset % num_groups) * bits_stride + (offset // num_groups) * nbits new_qweight |= 
((qweight >> (nbits * offset)) & mask) << shift - if nbits == 1 and target_dtype == "int8": + if nbits == 1 and target_dtype == T.int8: # special handling for 1b interleave n16_weight = new_qweight & np.int32(0xF0F00F0F) n16_weight |= ((new_qweight & np.int32(0x000000F0)) >> 4) << 16 @@ -343,12 +340,12 @@ def interleave_weight(qweight, nbits=4, target_dtype="float16"): n16_weight |= ((new_qweight & np.int32(0x000F0000)) >> 16) << 4 n16_weight |= ((new_qweight & np.int32(0x0F000000)) >> 24) << 12 return n16_weight.view(np.int8) - elif nbits == 2 and target_dtype == "float16": + elif nbits == 2 and target_dtype == T.float16: n8_weight = new_qweight & np.int32(0xFF0000FF) n8_weight |= ((new_qweight & np.int32(0x0000FF00)) >> 8) << 16 n8_weight |= ((new_qweight & np.int32(0x00FF0000)) >> 16) << 8 return n8_weight.view(np.int8) - elif nbits == 1 and target_dtype == "float16": + elif nbits == 1 and target_dtype == T.float16: n8_weight = new_qweight & 0xF000000F n8_weight |= ((new_qweight & 0x000000F0) >> 4) << 8 n8_weight |= ((new_qweight & 0x00000F00) >> 8) << 16 @@ -360,13 +357,7 @@ def interleave_weight(qweight, nbits=4, target_dtype="float16"): return new_qweight.view(np.int8) -def assert_bitnet_158_int8xint2_prefill_correctness(M, - N, - K, - in_dtype, - out_dtype, - accum_dtype, - fast_decoding=True): +def assert_bitnet_158_int8xint2_prefill_correctness(M, N, K, in_dtype, out_dtype, accum_dtype, fast_decoding=True): program = bitnet_158_int8xint2_prefill(M, N, K, in_dtype, out_dtype, accum_dtype, fast_decoding) print(program) kernel = tilelang.compile(program) @@ -391,4 +382,4 @@ def assert_bitnet_158_int8xint2_prefill_correctness(M, if __name__ == "__main__": - assert_bitnet_158_int8xint2_prefill_correctness(256, 256, 256, "int8", "int32", "int32") + assert_bitnet_158_int8xint2_prefill_correctness(256, 256, 256, T.int8, T.int32, T.int32) diff --git a/examples/bitnet-1.58b/kernel_benchmark/tl_int8xint8.py b/examples/bitnet-1.58b/kernel_benchmark/tl_int8xint8.py index 986463598846fafffc4fc38b35a4137e23edbeaa..e3d35df4b268e617f4d12e374b63dd51c7b3b071 100644 --- a/examples/bitnet-1.58b/kernel_benchmark/tl_int8xint8.py +++ b/examples/bitnet-1.58b/kernel_benchmark/tl_int8xint8.py @@ -6,7 +6,8 @@ from tvm import tl as TL import tvm.tl.language as T from bitblas.tl.utils import get_swizzle_layout from bitblas.tl.mma_macro_generator import ( - TensorCoreIntrinEmitter,) + TensorCoreIntrinEmitter, +) from bitblas.base import simplify_prim_func torch.manual_seed(0) @@ -37,18 +38,18 @@ def tl_matmul( accum_dtype, ): assert in_dtype in [ - "float16", - "int8", + T.float16, + T.int8, ], "Currently only float16 and int8 are supported" assert out_dtype in [ - "float16", - "float32", - "int32", + T.float16, + T.float32, + T.int32, ], "Currently only float16, float32 and int32 are supported" micro_size_x = micro_size_y = micro_size_k = 16 - if out_dtype == "int32": + if out_dtype == T.int32: micro_size_k = 32 # This is a debug config @@ -56,7 +57,7 @@ def tl_matmul( block_col_warps = 2 warp_row_tiles = 64 warp_col_tiles = 64 - chunk = 32 if in_dtype == "float16" else 64 + chunk = 32 if in_dtype == T.float16 else 64 shared_scope = "shared.dyn" # Pipeline Stage @@ -101,12 +102,11 @@ def tl_matmul( @T.prim_func def main( - A: T.Buffer(A_shape, in_dtype), - B: T.Buffer(B_shape, in_dtype), - C: T.Buffer((M, N), out_dtype), + A: T.Buffer(A_shape, in_dtype), + B: T.Buffer(B_shape, in_dtype), + C: T.Buffer((M, N), out_dtype), ): with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, 
by): - A_shared = T.alloc_shared(A_shared_shape, in_dtype, scope=shared_scope) B_shared = T.alloc_shared(B_shared_shape, in_dtype, scope=shared_scope) C_shared = T.alloc_shared(C_shared_shape, out_dtype, scope=shared_scope) @@ -116,10 +116,12 @@ def tl_matmul( thread_bindings = T.thread_binding(0, threads, "threadIdx.x") - T.annotate_layout({ - A_shared: make_swizzle_layout(A_shared), - B_shared: make_swizzle_layout(B_shared), - }) + T.annotate_layout( + { + A_shared: make_swizzle_layout(A_shared), + B_shared: make_swizzle_layout(B_shared), + } + ) # Improve L2 Cache T.use_swizzle(panel_size=10) @@ -127,7 +129,6 @@ def tl_matmul( T.clear(C_local) for ko in T.Pipelined((K // block_K), num_stages=stage): - # Load A into shared memory for i, k in T.Parallel(block_M, block_K): A_shared[i, k] = A[by * block_M + i, ko * block_K + k] @@ -137,7 +138,6 @@ def tl_matmul( B_shared[j, k] = B[bx * block_N + j, ko * block_K + k] for ki in T.serial(0, (block_K // micro_size_k)): - # Load A into fragment mma_emitter.ldmatrix_a( A_local, @@ -183,7 +183,7 @@ def assert_tl_matmul_correctness(M, N, K, in_dtype, out_dtype, accum_dtype): # src_code is the generated cuda source assert src_code is not None print(src_code) - if in_dtype == "int8": + if in_dtype == T.int8: A = torch.randint(-7, 7, (M, K), device="cuda", dtype=torch.int8) B = torch.randint(-7, 7, (N, K), device="cuda", dtype=torch.int8) else: @@ -209,12 +209,12 @@ def assert_tl_matmul_correctness(M, N, K, in_dtype, out_dtype, accum_dtype): def test_assert_tl_matmul(): - assert_tl_matmul_correctness(128, 128, 128, "float16", "float16", "float16") - assert_tl_matmul_correctness(128, 256, 256, "float16", "float32", "float32") + assert_tl_matmul_correctness(128, 128, 128, T.float16, T.float16, T.float16) + assert_tl_matmul_correctness(128, 256, 256, T.float16, T.float32, T.float32) if __name__ == "__main__": # bitblas.testing.main() - # assert_tl_matmul_correctness(128, 128, 128, "float16", "float16", "float16") - # assert_tl_matmul_correctness(128, 128, 128, "int8", "int32", "int32") - assert_tl_matmul_correctness(16384, 16384, 16384, "int8", "int32", "int32") + # assert_tl_matmul_correctness(128, 128, 128, T.float16, T.float16, T.float16) + # assert_tl_matmul_correctness(128, 128, 128, T.int8, T.int32, T.int32) + assert_tl_matmul_correctness(16384, 16384, 16384, T.int8, T.int32, T.int32) diff --git a/examples/bitnet-1.58b/load_from_quantized.py b/examples/bitnet-1.58b/load_from_quantized.py index 26a32f9747f611f26a63d2a24b35343b1e55d9f0..8c775aa4c8e819ee3ac800fce4ebe0452fac54be 100644 --- a/examples/bitnet-1.58b/load_from_quantized.py +++ b/examples/bitnet-1.58b/load_from_quantized.py @@ -49,7 +49,13 @@ def generate_text(model, tokenizer, prompt, max_length=100): def main(): # load quantized model - qmodel = BitnetForCausalLM.from_quantized(saved_model_path,).cuda().half() + qmodel = ( + BitnetForCausalLM.from_quantized( + saved_model_path, + ) + .cuda() + .half() + ) tokenizer = BitnetTokenizer.from_pretrained(model_name_or_path, use_fast=False) # print("original model generated text:") # print(generate_text(model, tokenizer, "Hi, ", max_length=100)) diff --git a/examples/bitnet-1.58b/maint/create_bitblas_ckpt.py b/examples/bitnet-1.58b/maint/create_bitblas_ckpt.py index 1e29a553abf75e4a73937a80564d33815f0b983f..2604ef38770fa58fa80cf87709e0b205eae26ecd 100644 --- a/examples/bitnet-1.58b/maint/create_bitblas_ckpt.py +++ b/examples/bitnet-1.58b/maint/create_bitblas_ckpt.py @@ -25,9 +25,9 @@ parser.add_argument("--saved_model_path", type=str, 
default=None) args = parser.parse_args() model_name_or_path = args.model_name_or_path -saved_model_path = os.path.join( - dirpath, "models", - f"{model_name_or_path}_bitblas") if args.saved_model_path is None else args.saved_model_path +saved_model_path = ( + os.path.join(dirpath, "models", f"{model_name_or_path}_bitblas") if args.saved_model_path is None else args.saved_model_path +) def generate_text(model, tokenizer, prompt, max_length=100): @@ -67,7 +67,10 @@ def main(): model_name_or_path, use_flash_attention_2=False, torch_dtype=torch.float16, - ).cuda().half()) + ) + .cuda() + .half() + ) tokenizer = BitnetTokenizer.from_pretrained(model_name_or_path, use_fast=False) # print("original model generated text:") @@ -112,10 +115,16 @@ def main(): file_path = cached_file(model_name_or_path, file) os.system(f"cp {file_path} {saved_model_path}") # load quantized model - qmodel = BitnetForCausalLM.from_quantized(saved_model_path,).cuda().half() + qmodel = ( + BitnetForCausalLM.from_quantized( + saved_model_path, + ) + .cuda() + .half() + ) print("quantized model generated text:") print(generate_text(qmodel, tokenizer, "Hi, ", max_length=100)) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/examples/bitnet-1.58b/modeling_bitnet.py b/examples/bitnet-1.58b/modeling_bitnet.py index 6e3c42b6f9968bb2f76e20609fd48a1d9994f171..1830995ee6177536089fe517646b290c18bb05f2 100644 --- a/examples/bitnet-1.58b/modeling_bitnet.py +++ b/examples/bitnet-1.58b/modeling_bitnet.py @@ -64,8 +64,7 @@ def find_layers(module, layers=None, name=""): return {name: module} res = {} for name1, child in module.named_children(): - res.update( - find_layers(child, layers=layers, name=name + "." + name1 if name != "" else name1)) + res.update(find_layers(child, layers=layers, name=name + "." 
+ name1 if name != "" else name1)) return res @@ -87,7 +86,6 @@ def _get_unpad_data(attention_mask): class BitnetRMSNorm(nn.Module): - def __init__(self, hidden_size, eps=1e-6): """ BitnetRMSNorm is equivalent to T5LayerNorm @@ -108,34 +106,23 @@ ALL_LAYERNORM_LAYERS.append(BitnetRMSNorm) class BitnetRotaryEmbedding(nn.Module): - - def __init__(self, - dim, - max_position_embeddings=2048, - base=10000, - device=None, - scaling_factor=1.0): + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): super().__init__() self.scaling_factor = scaling_factor self.dim = dim self.max_position_embeddings = max_position_embeddings self.base = base - inv_freq = 1.0 / ( - self.base - **(torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim)) + inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim)) self.register_buffer("inv_freq", inv_freq) # For BC we register cos and sin cached self.max_seq_len_cached = max_position_embeddings - t = torch.arange( - self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq) + t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq) t = t / self.scaling_factor freqs = torch.outer(t, self.inv_freq) # Different from paper, but it uses a different permutation in order to obtain the same calculation emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer( - "_cos_cached", emb.cos().to(torch.get_default_dtype()), persistent=False) - self.register_buffer( - "_sin_cached", emb.sin().to(torch.get_default_dtype()), persistent=False) + self.register_buffer("_cos_cached", emb.cos().to(torch.get_default_dtype()), persistent=False) + self.register_buffer("_sin_cached", emb.sin().to(torch.get_default_dtype()), persistent=False) @property def sin_cached(self): @@ -156,14 +143,12 @@ class BitnetRotaryEmbedding(nn.Module): @torch.no_grad() def forward(self, x, position_ids): # x: [bs, num_attention_heads, seq_len, head_size] - inv_freq_expanded = self.inv_freq[None, :, - None].float().expand(position_ids.shape[0], -1, 1) + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) position_ids_expanded = position_ids[:, None, :].float() # Force float32 since bfloat16 loses precision on long contexts # See https://github.com/huggingface/transformers/pull/29285 device_type = x.device.type - device_type = device_type if isinstance(device_type, - str) and device_type != "mps" else "cpu" + device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu" with torch.autocast(device_type=device_type, enabled=False): freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) emb = torch.cat((freqs, freqs), dim=-1) @@ -174,8 +159,8 @@ class BitnetRotaryEmbedding(nn.Module): def rotate_half(x): """Rotates half the hidden dims of the input.""" - x1 = x[..., :x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2:] + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] return torch.cat((-x2, x1), dim=-1) @@ -207,7 +192,6 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): class BitnetMLP(nn.Module): - def __init__(self, config): super().__init__() self.config = config @@ -245,7 +229,6 @@ class BitnetMLP(nn.Module): class BitnetMLPFuseGateUp(nn.Module): - def __init__(self, config): super().__init__() self.config = config @@ -272,8 +255,7 @@ class BitnetMLPFuseGateUp(nn.Module): def 
from_bit_mlp(cls, bit_mlp: BitnetMLP): module = cls(bit_mlp.config) # assign the weights - module.gate_up_proj.weight = nn.Parameter( - torch.cat([bit_mlp.gate_proj.weight, bit_mlp.up_proj.weight], dim=0)) + module.gate_up_proj.weight = nn.Parameter(torch.cat([bit_mlp.gate_proj.weight, bit_mlp.up_proj.weight], dim=0)) module.down_proj = bit_mlp.down_proj module.ffn_layernorm = bit_mlp.ffn_layernorm return module @@ -295,8 +277,7 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: batch, num_key_value_heads, slen, head_dim = hidden_states.shape if n_rep == 1: return hidden_states - hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, - head_dim) + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) @@ -311,7 +292,8 @@ class BitnetAttention(nn.Module): logger.warning_once( f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will " "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` " - "when creating this class.") + "when creating this class." + ) self.attention_dropout = config.attention_dropout self.hidden_size = config.hidden_size @@ -325,8 +307,8 @@ class BitnetAttention(nn.Module): if (self.head_dim * self.num_heads) != self.hidden_size: raise ValueError( - f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" - f" and `num_heads`: {self.num_heads}).") + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size} and `num_heads`: {self.num_heads})." + ) self.q_proj = BitLinear( self.hidden_size, @@ -387,10 +369,8 @@ class BitnetAttention(nn.Module): value_states = self.v_proj(hidden_states) query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, - self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, - self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) past_key_value = getattr(self, "past_key_value", past_key_value) cos, sin = self.rotary_emb(value_states, position_ids) @@ -399,30 +379,24 @@ class BitnetAttention(nn.Module): if past_key_value is not None: # sin and cos are specific to RoPE models; cache_position needed for the static cache cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} - key_states, value_states = past_key_value.update(key_states, value_states, - self.layer_idx, cache_kwargs) + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) key_states = repeat_kv(key_states, self.num_key_value_groups) value_states = repeat_kv(value_states, self.num_key_value_groups) - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt( - self.head_dim) + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) if attention_mask is not None: # no matter the length, we just slice it - causal_mask = attention_mask[:, :, :, :key_states.shape[-2]] + causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] attn_weights = attn_weights + causal_mask # upcast attention to fp32 - 
attn_weights = nn.functional.softmax( - attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) - attn_weights = nn.functional.dropout( - attn_weights, p=self.attention_dropout, training=self.training) + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) attn_output = torch.matmul(attn_weights, value_states) if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" - f" {attn_output.size()}") + raise ValueError(f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is {attn_output.size()}") attn_output = attn_output.transpose(1, 2).contiguous() @@ -448,7 +422,8 @@ class BitnetAttentionQKVFused(nn.Module): logger.warning_once( f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will " "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` " - "when creating this class.") + "when creating this class." + ) self.attention_dropout = config.attention_dropout self.hidden_size = config.hidden_size @@ -462,8 +437,8 @@ class BitnetAttentionQKVFused(nn.Module): if (self.head_dim * self.num_heads) != self.hidden_size: raise ValueError( - f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" - f" and `num_heads`: {self.num_heads}).") + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size} and `num_heads`: {self.num_heads})." + ) self.qkv_proj = BitLinear( self.hidden_size, @@ -497,17 +472,12 @@ class BitnetAttentionQKVFused(nn.Module): module = cls(bit_attention.config, bit_attention.layer_idx) # assign the weights module.qkv_proj.weight = nn.Parameter( - torch.cat([ - bit_attention.q_proj.weight, bit_attention.k_proj.weight, - bit_attention.v_proj.weight - ], - dim=0)) + torch.cat([bit_attention.q_proj.weight, bit_attention.k_proj.weight, bit_attention.v_proj.weight], dim=0) + ) if bit_attention.q_proj.bias is not None and bit_attention.k_proj.bias is not None and bit_attention.v_proj.bias is not None: module.qkv_proj.bias = nn.Parameter( - torch.cat([ - bit_attention.q_proj.bias, bit_attention.k_proj.bias, bit_attention.v_proj.bias - ], - dim=0)) + torch.cat([bit_attention.q_proj.bias, bit_attention.k_proj.bias, bit_attention.v_proj.bias], dim=0) + ) module.o_proj = bit_attention.o_proj module.inner_attn_ln = bit_attention.inner_attn_ln if bit_attention.config.rope_scaling is None: @@ -528,16 +498,13 @@ class BitnetAttentionQKVFused(nn.Module): bsz, q_len, _ = hidden_states.size() qkv_states = self.qkv_proj(hidden_states) query_states, key_states, value_states = torch.split( - qkv_states, [ - self.num_heads * self.head_dim, self.num_key_value_heads * self.head_dim, - self.num_key_value_heads * self.head_dim - ], - dim=-1) + qkv_states, + [self.num_heads * self.head_dim, self.num_key_value_heads * self.head_dim, self.num_key_value_heads * self.head_dim], + dim=-1, + ) query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, - self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, - self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, 
self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) past_key_value = getattr(self, "past_key_value", past_key_value) cos, sin = self.rotary_emb(value_states, position_ids) @@ -546,30 +513,24 @@ class BitnetAttentionQKVFused(nn.Module): if past_key_value is not None: # sin and cos are specific to RoPE models; cache_position needed for the static cache cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} - key_states, value_states = past_key_value.update(key_states, value_states, - self.layer_idx, cache_kwargs) + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) key_states = repeat_kv(key_states, self.num_key_value_groups) value_states = repeat_kv(value_states, self.num_key_value_groups) - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt( - self.head_dim) + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) if attention_mask is not None: # no matter the length, we just slice it - causal_mask = attention_mask[:, :, :, :key_states.shape[-2]] + causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] attn_weights = attn_weights + causal_mask # upcast attention to fp32 - attn_weights = nn.functional.softmax( - attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) - attn_weights = nn.functional.dropout( - attn_weights, p=self.attention_dropout, training=self.training) + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) attn_output = torch.matmul(attn_weights, value_states) if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" - f" {attn_output.size()}") + raise ValueError(f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is {attn_output.size()}") attn_output = attn_output.transpose(1, 2).contiguous() @@ -622,10 +583,8 @@ class BitnetFlashAttention2(BitnetAttention): # batch_size x seq_length x head_dim x hidden_dim # therefore we just need to keep the original shape query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, - self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, - self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) cos, sin = self.rotary_emb(value_states, position_ids) query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) @@ -635,8 +594,7 @@ class BitnetFlashAttention2(BitnetAttention): if past_key_value is not None: # sin and cos are specific to RoPE models; cache_position needed for the static cache cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} - key_states, value_states = past_key_value.update(key_states, value_states, - self.layer_idx, cache_kwargs) + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) # TODO: These transpose are quite inefficient but Flash Attention requires 
the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache # to be able to avoid many of these transpose/reshape/view. @@ -665,14 +623,14 @@ class BitnetFlashAttention2(BitnetAttention): logger.warning_once( f"The input hidden states seems to be silently casted in float32, this might be related to" f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" - f" {target_dtype}.") + f" {target_dtype}." + ) query_states = query_states.to(target_dtype) key_states = key_states.to(target_dtype) value_states = value_states.to(target_dtype) - attn_output = self._flash_attention_forward( - query_states, key_states, value_states, attention_mask, q_len, dropout=dropout_rate) + attn_output = self._flash_attention_forward(query_states, key_states, value_states, attention_mask, q_len, dropout=dropout_rate) attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous() attn_output = self.inner_attn_ln(attn_output) @@ -683,14 +641,9 @@ class BitnetFlashAttention2(BitnetAttention): return attn_output, attn_weights, past_key_value - def _flash_attention_forward(self, - query_states, - key_states, - value_states, - attention_mask, - query_length, - dropout=0.0, - softmax_scale=None): + def _flash_attention_forward( + self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None + ): """ Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token first unpad the input, then computes the attention scores and pad the final attention scores. @@ -720,7 +673,8 @@ class BitnetFlashAttention2(BitnetAttention): if attention_mask is not None: batch_size = query_states.shape[0] query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( - query_states, key_states, value_states, attention_mask, query_length) + query_states, key_states, value_states, attention_mask, query_length + ) cu_seqlens_q, cu_seqlens_k = cu_seq_lens max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens @@ -740,13 +694,7 @@ class BitnetFlashAttention2(BitnetAttention): attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) else: - attn_output = flash_attn_func( - query_states, - key_states, - value_states, - dropout, - softmax_scale=softmax_scale, - causal=causal) + attn_output = flash_attn_func(query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal) return attn_output @@ -754,28 +702,24 @@ class BitnetFlashAttention2(BitnetAttention): indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape - key_layer = index_first_axis( - key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k) - value_layer = index_first_axis( - value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k) + key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k) + value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k) if query_length == kv_seq_len: - query_layer = index_first_axis( - query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k) + query_layer = index_first_axis(query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k) cu_seqlens_q = cu_seqlens_k 
max_seqlen_in_batch_q = max_seqlen_in_batch_k indices_q = indices_k elif query_length == 1: max_seqlen_in_batch_q = 1 cu_seqlens_q = torch.arange( - batch_size + 1, dtype=torch.int32, - device=query_layer.device) # There is a memcpy here, that is very bad. + batch_size + 1, dtype=torch.int32, device=query_layer.device + ) # There is a memcpy here, that is very bad. indices_q = cu_seqlens_q[:-1] query_layer = query_layer.squeeze(1) else: # The -q_len: slice assumes left padding. attention_mask = attention_mask[:, -query_length:] - query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input( - query_layer, attention_mask) + query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) return ( query_layer, @@ -794,13 +738,11 @@ LLAMA_ATTENTION_CLASSES = { class BitnetDecoderLayer(nn.Module): - def __init__(self, config: BitnetConfig, layer_idx: int): super().__init__() self.hidden_size = config.hidden_size - self.self_attn = LLAMA_ATTENTION_CLASSES[config._attn_implementation]( - config=config, layer_idx=layer_idx) + self.self_attn = LLAMA_ATTENTION_CLASSES[config._attn_implementation](config=config, layer_idx=layer_idx) self.mlp = BitnetMLP(config) self.input_layernorm = BitnetRMSNorm(config.hidden_size, eps=config.rms_norm_eps) @@ -834,7 +776,8 @@ class BitnetDecoderLayer(nn.Module): if "padding_mask" in kwargs: warnings.warn( "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`", - stacklevel=2) + stacklevel=2, + ) residual = hidden_states @@ -925,8 +868,7 @@ class BitnetPreTrainedModel(PreTrainedModel): dtype = self.config._pre_quantization_dtype else: dtype = layer.self_attn.o_proj.weight.dtype - layer.self_attn.past_key_value = cache_cls( - self.config, max_batch_size, max_cache_len, device=device, dtype=dtype) + layer.self_attn.past_key_value = cache_cls(self.config, max_batch_size, max_cache_len, device=device, dtype=dtype) def _reset_cache(self): for layer in self.model.layers: @@ -1025,9 +967,7 @@ class BitnetModel(BitnetPreTrainedModel): self.vocab_size = config.vocab_size self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) - self.layers = nn.ModuleList([ - BitnetDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers) - ]) + self.layers = nn.ModuleList([BitnetDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]) self.norm = BitnetRMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.gradient_checkpointing = False @@ -1055,21 +995,15 @@ class BitnetModel(BitnetPreTrainedModel): cache_position: Optional[torch.LongTensor] = None, ) -> Union[Tuple, BaseModelOutputWithPast]: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states - if output_hidden_states is not None else self.config.output_hidden_states) + output_hidden_states = output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states use_cache = use_cache if use_cache is not None else self.config.use_cache return_dict = return_dict if return_dict is not None else self.config.use_return_dict if (input_ids is None) ^ (inputs_embeds is not None): - raise ValueError( - "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" - ) + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one") if 
self.gradient_checkpointing and self.training and use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`." - ) + logger.warning_once("`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.") use_cache = False if inputs_embeds is None: @@ -1083,10 +1017,7 @@ class BitnetModel(BitnetPreTrainedModel): if cache_position is None: if isinstance(past_key_values, StaticCache): raise ValueError("cache_position is a required argument when using StaticCache.") - cache_position = torch.arange( - past_seen_tokens, - past_seen_tokens + inputs_embeds.shape[1], - device=inputs_embeds.device) + cache_position = torch.arange(past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device) if position_ids is None: position_ids = cache_position.unsqueeze(0) @@ -1143,12 +1074,9 @@ class BitnetModel(BitnetPreTrainedModel): next_cache = None if use_cache: - next_cache = ( - next_decoder_cache.to_legacy_cache() - if isinstance(next_decoder_cache, Cache) else next_decoder_cache) + next_cache = next_decoder_cache.to_legacy_cache() if isinstance(next_decoder_cache, Cache) else next_decoder_cache if not return_dict: - return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] - if v is not None) + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) return BaseModelOutputWithPast( last_hidden_state=hidden_states, past_key_values=next_cache, @@ -1172,14 +1100,9 @@ class BitnetModel(BitnetPreTrainedModel): if hasattr(self.layers[0].self_attn, "past_key_value"): # static cache target_length = self.config.max_position_embeddings else: # dynamic cache - target_length = ( - attention_mask.shape[-1] - if isinstance(attention_mask, torch.Tensor) else cache_position[-1] + 1) - - causal_mask = torch.full((sequence_length, target_length), - fill_value=min_dtype, - dtype=dtype, - device=device) + target_length = attention_mask.shape[-1] if isinstance(attention_mask, torch.Tensor) else cache_position[-1] + 1 + + causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device) if sequence_length != 1: causal_mask = torch.triu(causal_mask, diagonal=1) causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) @@ -1188,10 +1111,8 @@ class BitnetModel(BitnetPreTrainedModel): causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit if attention_mask.dim() == 2: mask_length = attention_mask.shape[-1] - padding_mask = causal_mask[..., :mask_length].eq( - 0.0) * attention_mask[:, None, None, :].eq(0.0) - causal_mask[..., :mask_length] = causal_mask[..., :mask_length].masked_fill( - padding_mask, min_dtype) + padding_mask = causal_mask[..., :mask_length].eq(0.0) * attention_mask[:, None, None, :].eq(0.0) + causal_mask[..., :mask_length] = causal_mask[..., :mask_length].masked_fill(padding_mask, min_dtype) elif attention_mask.dim() == 4: # backwards compatibility: we allow passing a 4D attention mask shorter than the input length with # cache. In that case, the 4D attention mask attends to the newest tokens only. 
@@ -1201,8 +1122,7 @@ class BitnetModel(BitnetPreTrainedModel): offset = 0 mask_shape = attention_mask.shape mask_slice = (attention_mask.eq(0.0)).to(dtype=dtype) * min_dtype - causal_mask[:mask_shape[0], :mask_shape[1], - offset:mask_shape[2] + offset, :mask_shape[3]] = mask_slice + causal_mask[: mask_shape[0], : mask_shape[1], offset : mask_shape[2] + offset, : mask_shape[3]] = mask_slice return causal_mask @@ -1279,9 +1199,7 @@ class BitnetForCausalLM(BitnetPreTrainedModel): "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." ```""" output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states - if output_hidden_states is not None else self.config.output_hidden_states) + output_hidden_states = output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states return_dict = return_dict if return_dict is not None else self.config.use_return_dict # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) @@ -1327,13 +1245,9 @@ class BitnetForCausalLM(BitnetPreTrainedModel): attentions=outputs.attentions, ) - def prepare_inputs_for_generation(self, - input_ids, - past_key_values=None, - attention_mask=None, - inputs_embeds=None, - cache_position=None, - **kwargs): + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, cache_position=None, **kwargs + ): # With static cache, the `past_key_values` is None # TODO joao: standardize interface for the different Cache classes and remove of this if has_static_cache = False @@ -1344,13 +1258,13 @@ class BitnetForCausalLM(BitnetPreTrainedModel): past_length = 0 if past_key_values is not None: if isinstance(past_key_values, Cache): - past_length = cache_position[ - 0] if cache_position is not None else past_key_values.get_seq_length() + past_length = cache_position[0] if cache_position is not None else past_key_values.get_seq_length() max_cache_length = ( torch.tensor(past_key_values.get_max_length(), device=input_ids.device) - if past_key_values.get_max_length() is not None else None) - cache_length = past_length if max_cache_length is None else torch.min( - max_cache_length, past_length) + if past_key_values.get_max_length() is not None + else None + ) + cache_length = past_length if max_cache_length is None else torch.min(max_cache_length, past_length) # TODO joao: remove this `else` after `generate` prioritizes `Cache` objects else: cache_length = past_length = past_key_values[0][0].shape[2] @@ -1361,7 +1275,7 @@ class BitnetForCausalLM(BitnetPreTrainedModel): # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as # input) if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: - input_ids = input_ids[:, -(attention_mask.shape[1] - past_length):] + input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard # input_ids based on the past_length. elif past_length < input_ids.shape[1]: @@ -1369,8 +1283,7 @@ class BitnetForCausalLM(BitnetPreTrainedModel): # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. 
- if (max_cache_length is not None and attention_mask is not None and - cache_length + input_ids.shape[1] > max_cache_length): + if max_cache_length is not None and attention_mask is not None and cache_length + input_ids.shape[1] > max_cache_length: attention_mask = attention_mask[:, -max_cache_length:] position_ids = kwargs.get("position_ids") @@ -1379,7 +1292,7 @@ class BitnetForCausalLM(BitnetPreTrainedModel): position_ids = attention_mask.long().cumsum(-1) - 1 position_ids.masked_fill_(attention_mask == 0, 1) if past_key_values: - position_ids = position_ids[:, -input_ids.shape[1]:] + position_ids = position_ids[:, -input_ids.shape[1] :] # if `inputs_embeds` are passed, we only want to use them in the 1st generation step if inputs_embeds is not None and past_key_values is None: @@ -1392,39 +1305,38 @@ class BitnetForCausalLM(BitnetPreTrainedModel): input_length = position_ids.shape[-1] if position_ids is not None else input_ids.shape[-1] if cache_position is None: - cache_position = torch.arange( - past_length, past_length + input_length, device=input_ids.device) + cache_position = torch.arange(past_length, past_length + input_length, device=input_ids.device) else: cache_position = cache_position[-input_length:] if has_static_cache: past_key_values = None - model_inputs.update({ - "position_ids": position_ids, - "cache_position": cache_position, - "past_key_values": past_key_values, - "use_cache": kwargs.get("use_cache"), - "attention_mask": attention_mask, - }) + model_inputs.update( + { + "position_ids": position_ids, + "cache_position": cache_position, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + } + ) return model_inputs @staticmethod def _reorder_cache(past_key_values, beam_idx): reordered_past = () for layer_past in past_key_values: - reordered_past += (tuple( - past_state.index_select(0, beam_idx.to(past_state.device)) - for past_state in layer_past),) + reordered_past += (tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),) return reordered_past @staticmethod def recursive_set(model, name, attr): - ''' - set layers.25.mlp.up_proj to attr - ''' + """ + set layers.25.mlp.up_proj to attr + """ - names = name.split('.') + names = name.split(".") obj = model for n in names[:-1]: obj = getattr(obj, n) @@ -1521,6 +1433,7 @@ class BitnetForCausalLM(BitnetPreTrainedModel): fuse_gateup = quant_config.get("fuse_gateup", True) import accelerate + if checkpoint_format == "bitblas": model = cls(config) for name, module in model.named_modules(): @@ -1567,7 +1480,6 @@ class BitnetForCausalLM(BitnetPreTrainedModel): LLAMA_START_DOCSTRING, ) class BitnetForSequenceClassification(BitnetPreTrainedModel): - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -1631,8 +1543,7 @@ class BitnetForSequenceClassification(BitnetPreTrainedModel): else: if input_ids is not None: # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility - sequence_lengths = torch.eq(input_ids, - self.config.pad_token_id).int().argmax(-1) - 1 + sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1 sequence_lengths = sequence_lengths % input_ids.shape[-1] sequence_lengths = sequence_lengths.to(logits.device) else: @@ -1646,8 +1557,7 @@ class BitnetForSequenceClassification(BitnetPreTrainedModel): if self.config.problem_type is None: if self.num_labels == 1: self.config.problem_type = "regression" - elif 
self.num_labels > 1 and (labels.dtype == torch.long or - labels.dtype == torch.int): + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): self.config.problem_type = "single_label_classification" else: self.config.problem_type = "multi_label_classification" diff --git a/examples/bitnet-1.58b/tokenization_bitnet.py b/examples/bitnet-1.58b/tokenization_bitnet.py index 6fea3252a9d85c76150888c0187b0a429d4cdfa8..2adfd6dee10e6d0fba443e14c7b828e73b378554 100644 --- a/examples/bitnet-1.58b/tokenization_bitnet.py +++ b/examples/bitnet-1.58b/tokenization_bitnet.py @@ -18,6 +18,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Tokenization classes for LLaMA.""" + import os from shutil import copyfile from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple @@ -37,12 +38,10 @@ VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"} PRETRAINED_VOCAB_FILES_MAP = { "vocab_file": { - "hf-internal-testing/llama-tokenizer": - "https://huggingface.co/hf-internal-testing/llama-tokenizer/resolve/main/tokenizer.model", + "hf-internal-testing/llama-tokenizer": "https://huggingface.co/hf-internal-testing/llama-tokenizer/resolve/main/tokenizer.model", }, "tokenizer_file": { - "hf-internal-testing/llama-tokenizer": - "https://huggingface.co/hf-internal-testing/llama-tokenizer/resolve/main/tokenizer_config.json", + "hf-internal-testing/llama-tokenizer": "https://huggingface.co/hf-internal-testing/llama-tokenizer/resolve/main/tokenizer_config.json", }, } PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { @@ -159,14 +158,10 @@ class BitnetTokenizer(PreTrainedTokenizer): **kwargs, ): self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs - bos_token = AddedToken( - bos_token, normalized=False, special=True) if isinstance(bos_token, str) else bos_token - eos_token = AddedToken( - eos_token, normalized=False, special=True) if isinstance(eos_token, str) else eos_token - unk_token = AddedToken( - unk_token, normalized=False, special=True) if isinstance(unk_token, str) else unk_token - pad_token = AddedToken( - pad_token, normalized=False, special=True) if isinstance(pad_token, str) else pad_token + bos_token = AddedToken(bos_token, normalized=False, special=True) if isinstance(bos_token, str) else bos_token + eos_token = AddedToken(eos_token, normalized=False, special=True) if isinstance(eos_token, str) else eos_token + unk_token = AddedToken(unk_token, normalized=False, special=True) if isinstance(unk_token, str) else unk_token + pad_token = AddedToken(pad_token, normalized=False, special=True) if isinstance(pad_token, str) else pad_token if legacy is None: logger.warning_once( @@ -174,7 +169,8 @@ class BitnetTokenizer(PreTrainedTokenizer): " expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you." " If you want to use the new behavior, set `legacy=False`. 
This should only be set if you understand what it" " means, and thoroughly read the reason why this was added as explained in" - " https://github.com/huggingface/transformers/pull/24565") + " https://github.com/huggingface/transformers/pull/24565" + ) legacy = True self.legacy = legacy @@ -214,8 +210,7 @@ class BitnetTokenizer(PreTrainedTokenizer): with open(self.vocab_file, "rb") as f: sp_model = f.read() - model_pb2 = import_protobuf( - f"The new behavior of {self.__class__.__name__} (with `self.legacy = False`)") + model_pb2 = import_protobuf(f"The new behavior of {self.__class__.__name__} (with `self.legacy = False`)") model = model_pb2.ModelProto.FromString(sp_model) normalizer_spec = model_pb2.NormalizerSpec() normalizer_spec.add_dummy_prefix = False @@ -261,8 +256,7 @@ class BitnetTokenizer(PreTrainedTokenizer): tokens = super().tokenize(text, **kwargs) - if len(tokens - ) > 1 and tokens[0] == SPIECE_UNDERLINE and tokens[1] in self.all_special_tokens: + if len(tokens) > 1 and tokens[0] == SPIECE_UNDERLINE and tokens[1] in self.all_special_tokens: tokens = tokens[1:] return tokens @@ -284,7 +278,7 @@ class BitnetTokenizer(PreTrainedTokenizer): # 1. Encode string + prefix ex: " Hey" tokens = self.sp_model.encode(self.unk_token + text, out_type=str) # 2. Remove self.unk_token from ['<','unk','>', '▁Hey'] - return tokens[self.unk_token_length:] if len(tokens) >= self.unk_token_length else tokens + return tokens[self.unk_token_length :] if len(tokens) >= self.unk_token_length else tokens def _convert_token_to_id(self, token): """Converts a token (str) in an id using the vocab.""" @@ -332,12 +326,9 @@ class BitnetTokenizer(PreTrainedTokenizer): if not os.path.isdir(save_directory): logger.error(f"Vocabulary path ({save_directory}) should be a directory") return - out_vocab_file = os.path.join(save_directory, - (filename_prefix + "-" if filename_prefix else "") + - VOCAB_FILES_NAMES["vocab_file"]) + out_vocab_file = os.path.join(save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]) - if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile( - self.vocab_file): + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file): copyfile(self.vocab_file, out_vocab_file) elif not os.path.isfile(self.vocab_file): with open(out_vocab_file, "wb") as fi: @@ -357,10 +348,9 @@ class BitnetTokenizer(PreTrainedTokenizer): return output - def get_special_tokens_mask(self, - token_ids_0: List[int], - token_ids_1: Optional[List[int]] = None, - already_has_special_tokens: bool = False) -> List[int]: + def get_special_tokens_mask( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False + ) -> List[int]: """ Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding special tokens using the tokenizer `prepare_for_model` method. @@ -377,20 +367,16 @@ class BitnetTokenizer(PreTrainedTokenizer): `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. 
""" if already_has_special_tokens: - return super().get_special_tokens_mask( - token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True) + return super().get_special_tokens_mask(token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True) bos_token_id = [1] if self.add_bos_token else [] eos_token_id = [1] if self.add_eos_token else [] if token_ids_1 is None: return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id - return (bos_token_id + ([0] * len(token_ids_0)) + eos_token_id + bos_token_id + - ([0] * len(token_ids_1)) + eos_token_id) + return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id + bos_token_id + ([0] * len(token_ids_1)) + eos_token_id - def create_token_type_ids_from_sequences(self, - token_ids_0: List[int], - token_ids_1: Optional[List[int]] = None) -> List[int]: + def create_token_type_ids_from_sequences(self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None) -> List[int]: """ Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT sequence pair mask has the following format: @@ -473,9 +459,9 @@ class BitnetTokenizer(PreTrainedTokenizer): "{% elif message['role'] == 'assistant' %}" "{{ ' ' + content.strip() + ' ' + eos_token }}" "{% endif %}" - "{% endfor %}") - template = template.replace("USE_DEFAULT_PROMPT", - "true" if self.use_default_system_prompt else "false") + "{% endfor %}" + ) + template = template.replace("USE_DEFAULT_PROMPT", "true" if self.use_default_system_prompt else "false") default_message = DEFAULT_SYSTEM_PROMPT.replace("\n", "\\n").replace("'", "\\'") template = template.replace("DEFAULT_SYSTEM_MESSAGE", default_message) diff --git a/examples/bitnet-1.58b/utils_quant.py b/examples/bitnet-1.58b/utils_quant.py index 5f5db5dbc04919546ac7137f6dc83c26377b56d5..5a50edb392ead6d55c9e34f19409cfb94848f13a 100644 --- a/examples/bitnet-1.58b/utils_quant.py +++ b/examples/bitnet-1.58b/utils_quant.py @@ -24,15 +24,14 @@ def weight_quant(weight, num_bits=1): def activation_quant(x, num_bits=8): dtype = x.dtype x = x.float() - Qn = -(2**(num_bits - 1)) - Qp = 2**(num_bits - 1) - 1 + Qn = -(2 ** (num_bits - 1)) + Qp = 2 ** (num_bits - 1) - 1 s = Qp / x.abs().max(dim=-1, keepdim=True).values.clamp(min=1e-5) result = (x * s).round().clamp(Qn, Qp) / s return result.type(dtype) class BitLinearBitBLAS(nn.Module): - def __init__( self, in_features: int, @@ -68,7 +67,7 @@ class BitLinearBitBLAS(nn.Module): self.bitblas_matmul = self._get_or_create_bitblas_operator(matmul_config, ENABLE_TUNING) self.format = "bitnet" - self.Qp = 2**(self.input_bits - 1) - 1 + self.Qp = 2 ** (self.input_bits - 1) - 1 def _get_or_create_bitblas_operator(self, config, enable_tuning): if global_operator_cache.size() == 0: @@ -99,8 +98,7 @@ class BitLinearBitBLAS(nn.Module): @classmethod def from_bit_linear(cls, bitlinear, weight_group=1): - bitblas_linear = cls( - bitlinear.in_features, bitlinear.out_features, weight_bits=1, input_bits=8) + bitblas_linear = cls(bitlinear.in_features, bitlinear.out_features, weight_bits=1, input_bits=8) sw, qweight = bitblas_linear.create_bitblas_weights(bitlinear.weight, weight_group) bitblas_linear.register_buffer("qweight", qweight) bitblas_linear.register_buffer("sw", sw) @@ -158,8 +156,8 @@ class BitLinearBitBLAS(nn.Module): @torch.compile def activation_quant(self, x, num_bits=8): x = x.float() - Qn = -(2**(num_bits - 1)) - Qp = 2**(num_bits - 1) - 1 + Qn = -(2 ** (num_bits - 1)) + Qp = 2 ** (num_bits - 1) - 1 s = Qp / x.abs().max(dim=-1, 
keepdim=True).values.clamp(min=1e-5) result = (x * s).round().clamp(Qn, Qp) return result.type(torch.int8), s @@ -173,9 +171,8 @@ class BitLinearBitBLAS(nn.Module): # for the correctness evaluation. def native_forward(self, input): - quant_input = (input + (activation_quant(input, self.input_bits) - input).detach()) - quant_weight = ( - self.weight + (weight_quant(self.weight, self.weight_bits) - self.weight).detach()) + quant_input = input + (activation_quant(input, self.input_bits) - input).detach() + quant_weight = self.weight + (weight_quant(self.weight, self.weight_bits) - self.weight).detach() out = nn.functional.linear(quant_input, quant_weight) if self.bias is not None: @@ -214,7 +211,6 @@ class BitLinearBitBLAS(nn.Module): # Naive BitLinear from HuggingFace class BitLinear(nn.Linear): - def __init__(self, *kargs, weight_bits=1, input_bits=8, **kwargs): super(BitLinear, self).__init__(*kargs, **kwargs) """ @@ -224,10 +220,8 @@ class BitLinear(nn.Linear): self.input_bits = input_bits def forward(self, input): - quant_input = input + (activation_quant(input, self.input_bits) - input).detach() - quant_weight = self.weight + (weight_quant(self.weight, self.weight_bits) - - self.weight).detach() + quant_weight = self.weight + (weight_quant(self.weight, self.weight_bits) - self.weight).detach() out = nn.functional.linear(quant_input, quant_weight) if self.bias is not None: diff --git a/examples/bitnet-1.58b/vllm_workspace/conftest.py b/examples/bitnet-1.58b/vllm_workspace/conftest.py index 951f3899148e999d7409a914a0cbad0809586373..e9e2997ef67c5c22b26235d00000332dfe20910f 100644 --- a/examples/bitnet-1.58b/vllm_workspace/conftest.py +++ b/examples/bitnet-1.58b/vllm_workspace/conftest.py @@ -20,7 +20,7 @@ from transformers import ( from vllm import LLM, SamplingParams from vllm.assets.image import ImageAsset from vllm.config import TokenizerPoolConfig -from vllm.distributed import (destroy_distributed_environment, destroy_model_parallel) +from vllm.distributed import destroy_distributed_environment, destroy_model_parallel from vllm.inputs import TextPrompt from vllm.logger import init_logger from vllm.sequence import SampleLogprobs @@ -56,12 +56,13 @@ else: class _ImageAssets(_ImageAssetsBase): - def __init__(self) -> None: - super().__init__([ - ImageAsset("stop_sign"), - ImageAsset("cherry_blossom"), - ]) + super().__init__( + [ + ImageAsset("stop_sign"), + ImageAsset("cherry_blossom"), + ] + ) def prompts(self, prompts: _ImageAssetPrompts) -> List[str]: """ @@ -136,7 +137,6 @@ _T = TypeVar("_T", nn.Module, torch.Tensor, BatchEncoding) class HfRunner: - def wrap_device(self, input: _T) -> _T: if not is_cpu(): return input.to("cuda") @@ -166,7 +166,8 @@ class HfRunner: SentenceTransformer( model_name, device="cpu", - ).to(dtype=torch_dtype)) + ).to(dtype=torch_dtype) + ) else: if is_vision_model: auto_cls = AutoModelForVision2Seq @@ -184,7 +185,8 @@ class HfRunner: torch_dtype=torch_dtype, trust_remote_code=True, **model_kwargs, - )) + ) + ) self.tokenizer = AutoTokenizer.from_pretrained( model_name, @@ -204,8 +206,7 @@ class HfRunner: ) except Exception: logger.warning( - "Unable to auto-load processor from HuggingFace for " - "model %s. Using tokenizer instead.", + "Unable to auto-load processor from HuggingFace for model %s. 
Using tokenizer instead.", model_name, ) self.processor = self.tokenizer @@ -362,7 +363,7 @@ class HfRunner: last_hidden_states, self.model.get_output_embeddings().weight.t(), ) - if (getattr(self.model.get_output_embeddings(), "bias", None) is not None): + if getattr(self.model.get_output_embeddings(), "bias", None) is not None: logits += self.model.get_output_embeddings().bias.unsqueeze(0) logprobs = F.log_softmax(logits, dim=-1, dtype=torch.float32) seq_logprobs.append(logprobs) @@ -389,8 +390,7 @@ class HfRunner: all_output_strs.append(self.tokenizer.decode(output_ids)) outputs = zip(all_output_ids, all_output_strs, all_logprobs) - return [(output_ids, output_str, output_logprobs) - for output_ids, output_str, output_logprobs in outputs] + return [(output_ids, output_str, output_logprobs) for output_ids, output_str, output_logprobs in outputs] def encode(self, prompts: List[str]) -> List[List[torch.Tensor]]: return self.model.encode(prompts) @@ -409,7 +409,6 @@ def hf_runner(): class VllmRunner: - def __init__( self, model_name: str, @@ -514,12 +513,10 @@ class VllmRunner: num_logprobs: int, images: Optional[List[Image.Image]] = None, ) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]: - greedy_logprobs_params = SamplingParams( - temperature=0.0, max_tokens=max_tokens, logprobs=num_logprobs) + greedy_logprobs_params = SamplingParams(temperature=0.0, max_tokens=max_tokens, logprobs=num_logprobs) outputs = self.generate_w_logprobs(prompts, greedy_logprobs_params, images=images) - return [(output_ids, output_str, output_logprobs) - for output_ids, output_str, output_logprobs in outputs] + return [(output_ids, output_str, output_logprobs) for output_ids, output_str, output_logprobs in outputs] def generate_beam_search( self, diff --git a/examples/bitnet-1.58b/vllm_workspace/inference_with_compress_format.py b/examples/bitnet-1.58b/vllm_workspace/inference_with_compress_format.py index 55a24543e3ebaee7e7f6a1278dd86d8501dcc0c9..ea18239cbc8fc00aaf65297a77fd5db0bf27e6ac 100644 --- a/examples/bitnet-1.58b/vllm_workspace/inference_with_compress_format.py +++ b/examples/bitnet-1.58b/vllm_workspace/inference_with_compress_format.py @@ -32,15 +32,14 @@ args = parser.parse_args() ckpt_path = args.ckpt_path with VllmRunner( - ckpt_path, - dtype="half", - quantization="bitblas", - # set enforce_eager = False to enable cuda graph - # set enforce_eager = True to disable cuda graph - enforce_eager=False, + ckpt_path, + dtype="half", + quantization="bitblas", + # set enforce_eager = False to enable cuda graph + # set enforce_eager = True to disable cuda graph + enforce_eager=False, ) as bitnet_model: - bitbnet_outputs = bitnet_model.generate_greedy(["Hi, tell me about microsoft?"], - max_tokens=1024) + bitbnet_outputs = bitnet_model.generate_greedy(["Hi, tell me about microsoft?"], max_tokens=1024) print("bitnet inference:") print(bitbnet_outputs[0][0]) print(bitbnet_outputs[0][1]) diff --git a/examples/bitnet-1.58b/vllm_workspace/inference_with_native_format.py b/examples/bitnet-1.58b/vllm_workspace/inference_with_native_format.py index 4f5f87f6ffe47ef2b8cbd9e2d2839d6b3b961cb0..f631fb306772408b17d71c35a5ae8bc1084e10d9 100644 --- a/examples/bitnet-1.58b/vllm_workspace/inference_with_native_format.py +++ b/examples/bitnet-1.58b/vllm_workspace/inference_with_native_format.py @@ -33,13 +33,13 @@ args = parser.parse_args() ckpt_path = args.ckpt_path with VllmRunner( - ckpt_path, - dtype="half", - quantization="bitnet_bitblas", - gpu_memory_utilization=0.5, - # set enforce_eager = False to enable cuda 
graph - # set enforce_eager = True to disable cuda graph - enforce_eager=False, + ckpt_path, + dtype="half", + quantization="bitnet_bitblas", + gpu_memory_utilization=0.5, + # set enforce_eager = False to enable cuda graph + # set enforce_eager = True to disable cuda graph + enforce_eager=False, ) as bitnet_model: bitbnet_outputs = bitnet_model.generate_greedy(["Hi, tell me about microsoft?"], max_tokens=128) print("bitnet inference output:") diff --git a/examples/bitnet-1.58b/vllm_workspace/utils.py b/examples/bitnet-1.58b/vllm_workspace/utils.py index daa9d8f52bddf6eb815e1b1ece851666bcf2a8a4..e96b19e28ca9e21af070bdd187e4b026aca26bc7 100644 --- a/examples/bitnet-1.58b/vllm_workspace/utils.py +++ b/examples/bitnet-1.58b/vllm_workspace/utils.py @@ -3,8 +3,7 @@ from typing import Dict, List, Tuple TokensText = Tuple[List[int], str] -def check_outputs_equal(outputs_0_lst: List[TokensText], outputs_1_lst: List[TokensText], - name_0: str, name_1: str): +def check_outputs_equal(outputs_0_lst: List[TokensText], outputs_1_lst: List[TokensText], name_0: str, name_1: str): """ Compare the two sequences generated by different models, which should be equal. @@ -15,19 +14,14 @@ def check_outputs_equal(outputs_0_lst: List[TokensText], outputs_1_lst: List[Tok output_ids_0, output_str_0 = outputs_0 output_ids_1, output_str_1 = outputs_1 - assert output_str_0 == output_str_1, (f"Test{prompt_idx}:" - f"\n{name_0}:\t{output_str_0!r}" - f"\n{name_1}:\t{output_str_1!r}") - assert output_ids_0 == output_ids_1, (f"Test{prompt_idx}:" - f"\n{name_0}:\t{output_str_0!r}" - f"\n{name_1}:\t{output_str_1!r}") + assert output_str_0 == output_str_1, f"Test{prompt_idx}:\n{name_0}:\t{output_str_0!r}\n{name_1}:\t{output_str_1!r}" + assert output_ids_0 == output_ids_1, f"Test{prompt_idx}:\n{name_0}:\t{output_str_0!r}\n{name_1}:\t{output_str_1!r}" TokensTextLogprobs = Tuple[List[int], str, List[Dict[int, float]]] -def check_logprobs_close(outputs_0_lst: List[TokensTextLogprobs], - outputs_1_lst: List[TokensTextLogprobs], name_0: str, name_1: str): +def check_logprobs_close(outputs_0_lst: List[TokensTextLogprobs], outputs_1_lst: List[TokensTextLogprobs], name_0: str, name_1: str): """ Compare the logprobs of two sequences generated by different models, which should be similar but not necessarily equal. @@ -41,16 +35,11 @@ def check_logprobs_close(outputs_0_lst: List[TokensTextLogprobs], # Loop through generated tokens. for idx, (output_id_0, output_id_1) in enumerate(zip(output_ids_0, output_ids_1)): - # If generated tokens don't match, then if output_id_0 != output_id_1: # Each predicted token must be in top N logprobs of the other - assert output_id_0 in logprobs_1[idx], (f"Test{prompt_idx}:" - f"\n{name_0}:\t{output_str_0!r}" - f"\n{name_1}:\t{output_str_1!r}") - assert output_id_1 in logprobs_0[idx], (f"Test{prompt_idx}:" - f"\n{name_0}:\t{output_str_0!r}" - f"\n{name_1}:\t{output_str_1!r}") + assert output_id_0 in logprobs_1[idx], f"Test{prompt_idx}:\n{name_0}:\t{output_str_0!r}\n{name_1}:\t{output_str_1!r}" + assert output_id_1 in logprobs_0[idx], f"Test{prompt_idx}:\n{name_0}:\t{output_str_0!r}\n{name_1}:\t{output_str_1!r}" # Break out since sequences will now diverge. 
break diff --git a/examples/blocksparse_attention/block_sparse_attn_triton.py b/examples/blocksparse_attention/block_sparse_attn_triton.py index 014f0c5fcbce0d96abf4c374029a4f0400c57741..1794836342197de8c16bfa2eb515e872c94c663b 100644 --- a/examples/blocksparse_attention/block_sparse_attn_triton.py +++ b/examples/blocksparse_attention/block_sparse_attn_triton.py @@ -15,10 +15,7 @@ def get_sparse_attn_mask_from_topk(x, topk, use_dense_for_last_block=False): bsz, num_head, downsample_len, _ = x.shape # N_CTX = downsample_len * BLOCK sparse_index = torch.topk(x, topk, dim=-1).indices - dense_mask = torch.full([bsz, num_head, downsample_len, downsample_len], - False, - dtype=torch.bool, - device=x.device) + dense_mask = torch.full([bsz, num_head, downsample_len, downsample_len], False, dtype=torch.bool, device=x.device) dense_mask.scatter_(-1, sparse_index, True) if use_dense_for_last_block: dense_mask[:, :, -2:, :] = True @@ -56,7 +53,6 @@ def _fwd_kernel_inner( BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, ): - mask_val = tl.load(block_mask_ptr + k_block_col_idx * stride_bmask_n) # print @@ -73,8 +69,7 @@ def _fwd_kernel_inner( # the following is needed only when LAST_K_BLOCK or BLOCK_M < BLOCK_N if LAST_K_BLOCK: - qk += tl.where(offs_m[:, None] + past_len >= (start_n + offs_n[None, :]), 0, - float('-inf')) + qk += tl.where(offs_m[:, None] + past_len >= (start_n + offs_n[None, :]), 0, float("-inf")) m_ij = tl.maximum(m_i, tl.max(qk, 1)) qk -= m_ij[:, None] @@ -154,7 +149,7 @@ def _fwd_kernel( v_ptrs = V + off_v mask_ptrs = block_mask_ptr + start_m * stride_bmm - m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float('inf') + m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf") l_i = tl.zeros([BLOCK_M], dtype=tl.float32) acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32) @@ -192,24 +187,12 @@ def _fwd_kernel( acc = acc * l_recip acc = acc.to(Out.dtype.element_ty) - off_o = off_z * stride_oz + off_h * stride_oh + offs_m[:, None] * stride_om + offs_d[ - None, :] * stride_od + off_o = off_z * stride_oz + off_h * stride_oh + offs_m[:, None] * stride_om + offs_d[None, :] * stride_od out_ptrs = Out + off_o tl.store(out_ptrs, acc, mask=offs_m[:, None] < N_CTX) -def _forward(ctx, - q, - k, - v, - block_sparse_mask, - sm_scale, - BLOCK_M=64, - BLOCK_N=64, - num_warps=None, - num_stages=1, - out=None): - +def _forward(ctx, q, k, v, block_sparse_mask, sm_scale, BLOCK_M=64, BLOCK_N=64, num_warps=None, num_stages=1, out=None): assert q.shape[-1] == k.shape[-1] == v.shape[-1] assert k.shape[2] == v.shape[2] o = out if out is not None else torch.empty_like(q).contiguous() @@ -254,7 +237,6 @@ def _forward(ctx, class _sparse_attention(torch.autograd.Function): - @staticmethod def forward(ctx, q, k, v, block_sparse_dense, sm_scale): # shape constraints @@ -278,9 +260,9 @@ def test_topk_sparse_attention(): torch.manual_seed(0) # Create inputs - q = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.bfloat16) - k = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.bfloat16) - v = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.bfloat16) + q = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device="cuda", dtype=torch.bfloat16) + k = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device="cuda", dtype=torch.bfloat16) + v = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device="cuda", dtype=torch.bfloat16) sm_scale = 1.0 / (D_HEAD**0.5) # Create sparse mask (downsampled to block level) @@ -288,9 +270,7 @@ def test_topk_sparse_attention(): 
downsample_len = math.ceil(SEQ_LEN / downsample_factor) print("downsample_len", downsample_len) - x_ds = torch.randn([BATCH, N_HEADS, downsample_len, downsample_len], - device='cuda', - dtype=torch.bfloat16) + x_ds = torch.randn([BATCH, N_HEADS, downsample_len, downsample_len], device="cuda", dtype=torch.bfloat16) x_ds[:, :, :, 0] = 100 print("x_ds.shape", x_ds.shape) block_mask = get_sparse_attn_mask_from_topk(x_ds, topk=TOPK) @@ -302,22 +282,21 @@ def test_topk_sparse_attention(): # Compute reference # Expand block mask to full attention matrix - full_mask = torch.kron(block_mask.float(), torch.ones(BLOCK, BLOCK, device='cuda')) + full_mask = torch.kron(block_mask.float(), torch.ones(BLOCK, BLOCK, device="cuda")) full_mask = full_mask[..., :SEQ_LEN, :SEQ_LEN].bool() full_mask = full_mask & torch.tril(torch.ones_like(full_mask)) # Apply causal # PyTorch reference implementation - attn = torch.einsum('bhsd,bhtd->bhst', q, k) * sm_scale - attn = attn.masked_fill(~full_mask, float('-inf')) + attn = torch.einsum("bhsd,bhtd->bhst", q, k) * sm_scale + attn = attn.masked_fill(~full_mask, float("-inf")) attn = F.softmax(attn, dim=-1) - ref_output = torch.einsum('bhst,bhtd->bhsd', attn, v) + ref_output = torch.einsum("bhst,bhtd->bhsd", attn, v) # print("ref_output", ref_output) # print("triton_output", triton_output) # Verify accuracy - assert torch.allclose(triton_output, ref_output, atol=1e-2, rtol=1e-2), \ - "Triton output doesn't match reference" + assert torch.allclose(triton_output, ref_output, atol=1e-2, rtol=1e-2), "Triton output doesn't match reference" print("Pass topk sparse attention test with qlen == klen") @@ -329,9 +308,9 @@ def test_topk_sparse_attention_qlt_kl(): torch.manual_seed(0) # Create inputs. - q = torch.randn(BATCH, N_HEADS, Q_LEN, D_HEAD, device='cuda', dtype=torch.bfloat16) - k = torch.randn(BATCH, N_HEADS, K_LEN, D_HEAD, device='cuda', dtype=torch.bfloat16) - v = torch.randn(BATCH, N_HEADS, K_LEN, D_HEAD, device='cuda', dtype=torch.bfloat16) + q = torch.randn(BATCH, N_HEADS, Q_LEN, D_HEAD, device="cuda", dtype=torch.bfloat16) + k = torch.randn(BATCH, N_HEADS, K_LEN, D_HEAD, device="cuda", dtype=torch.bfloat16) + v = torch.randn(BATCH, N_HEADS, K_LEN, D_HEAD, device="cuda", dtype=torch.bfloat16) # softmax scale sm_scale = 1.0 / (D_HEAD**0.5) @@ -339,8 +318,7 @@ def test_topk_sparse_attention_qlt_kl(): print("downsample_factor", downsample_factor) downsample_len = math.ceil(K_LEN / downsample_factor) # number of blocks along one dimension print("downsample_len", downsample_len) - x_ds = torch.randn( - BATCH, N_HEADS, downsample_len, downsample_len, device='cuda', dtype=torch.bfloat16) + x_ds = torch.randn(BATCH, N_HEADS, downsample_len, downsample_len, device="cuda", dtype=torch.bfloat16) # Force the first column to be high so that the first block is always selected. 
x_ds[:, :, :, 0] = 100 block_mask = get_sparse_attn_mask_from_topk(x_ds, topk=TOPK) @@ -351,26 +329,25 @@ def test_topk_sparse_attention_qlt_kl(): past_len = K_LEN - Q_LEN - attn = torch.einsum('bhsd,bhtd->bhst', q, k) * sm_scale + attn = torch.einsum("bhsd,bhtd->bhst", q, k) * sm_scale - full_mask_full = torch.kron(block_mask.float(), torch.ones(BLOCK, BLOCK, device='cuda')).bool() + full_mask_full = torch.kron(block_mask.float(), torch.ones(BLOCK, BLOCK, device="cuda")).bool() full_mask_full = full_mask_full[..., :K_LEN, :K_LEN] effective_mask = full_mask_full[..., past_len:K_LEN, :] # shape: (B, H, Q_LEN, K_LEN) i_global = torch.arange(past_len, K_LEN, device=k.device).unsqueeze(1) # shape: (Q_LEN, 1) j_global = torch.arange(K_LEN, device=k.device).unsqueeze(0) # shape: (1, K_LEN) - causal_mask = (j_global <= i_global) # shape: (Q_LEN, K_LEN) + causal_mask = j_global <= i_global # shape: (Q_LEN, K_LEN) final_mask = effective_mask & causal_mask # shape: (B, H, Q_LEN, K_LEN) - attn = attn.masked_fill(~final_mask, float('-inf')) + attn = attn.masked_fill(~final_mask, float("-inf")) attn = F.softmax(attn, dim=-1) - ref_output = torch.einsum('bhst,bhtd->bhsd', attn, v) + ref_output = torch.einsum("bhst,bhtd->bhsd", attn, v) # Verify accuracy. - assert torch.allclose(triton_output, ref_output, atol=1e-2, rtol=1e-2), \ - "Triton output doesn't match reference when qlen < klen" + assert torch.allclose(triton_output, ref_output, atol=1e-2, rtol=1e-2), "Triton output doesn't match reference when qlen < klen" print("Pass topk sparse attention test with qlen < klen") diff --git a/examples/blocksparse_attention/example_tilelang_block_sparse_attn.py b/examples/blocksparse_attention/example_tilelang_block_sparse_attn.py index 7e90db7e5f036f0ef92d2ef8f6689e270f722045..934b0b25efaac9568dac2b398d274321a803b54a 100644 --- a/examples/blocksparse_attention/example_tilelang_block_sparse_attn.py +++ b/examples/blocksparse_attention/example_tilelang_block_sparse_attn.py @@ -10,10 +10,7 @@ def get_sparse_attn_mask_from_topk(x, topk, use_dense_for_last_block=False): bsz, num_head, downsample_len, _ = x.shape # N_CTX = downsample_len * BLOCK sparse_index = torch.topk(x, topk, dim=-1).indices - dense_mask = torch.full([bsz, num_head, downsample_len, downsample_len], - False, - dtype=torch.bool, - device=x.device) + dense_mask = torch.full([bsz, num_head, downsample_len, downsample_len], False, dtype=torch.bool, device=x.device) dense_mask.scatter_(-1, sparse_index, True) if use_dense_for_last_block: dense_mask[:, :, -2:, :] = True @@ -30,24 +27,25 @@ def get_sparse_attn_mask_from_threshold(x, threshold, use_dense_for_last_block=F @tilelang.jit( - out_idx=[4], pass_configs={ + out_idx=[4], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def blocksparse_flashattn(batch, heads, seq_len, dim, downsample_len, is_causal): block_M = 64 block_N = 64 num_stages = 1 threads = 128 - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) shape = [batch, heads, seq_len, dim] block_mask_shape = [batch, heads, downsample_len, downsample_len] - dtype = "float16" - accum_dtype = "float" - block_mask_dtype = "bool" + dtype = T.float16 + accum_dtype = T.float32 + block_mask_dtype = T.bool def kernel_func(block_M, block_N, num_stages, threads): - @T.macro def MMA0( K: T.Tensor(shape, dtype), @@ -59,11 +57,10 @@ def blocksparse_flashattn(batch, heads, seq_len, dim, downsample_len, is_causal) by: T.int32, bz: T.int32, ): - T.copy(K[bz, by, k * block_N:(k + 1) 
* block_N, :], K_shared) + T.copy(K[bz, by, k * block_N : (k + 1) * block_N, :], K_shared) if is_causal: for i, j in T.Parallel(block_M, block_N): - acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0, - -T.infinity(acc_s.dtype)) + acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0, -T.infinity(acc_s.dtype)) else: T.clear(acc_s) T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) @@ -78,18 +75,18 @@ def blocksparse_flashattn(batch, heads, seq_len, dim, downsample_len, is_causal) by: T.int32, bz: T.int32, ): - T.copy(V[bz, by, k * block_N:(k + 1) * block_N, :], V_shared) + T.copy(V[bz, by, k * block_N : (k + 1) * block_N, :], V_shared) T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) @T.macro def Softmax( - acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype), - acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype), - scores_max: T.FragmentBuffer([block_M], accum_dtype), - scores_max_prev: T.FragmentBuffer([block_M], accum_dtype), - scores_scale: T.FragmentBuffer([block_M], accum_dtype), - scores_sum: T.FragmentBuffer([block_M], accum_dtype), - logsum: T.FragmentBuffer([block_M], accum_dtype), + acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype), + acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype), + scores_max: T.FragmentBuffer([block_M], accum_dtype), + scores_max_prev: T.FragmentBuffer([block_M], accum_dtype), + scores_scale: T.FragmentBuffer([block_M], accum_dtype), + scores_sum: T.FragmentBuffer([block_M], accum_dtype), + logsum: T.FragmentBuffer([block_M], accum_dtype), ): T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) @@ -113,22 +110,21 @@ def blocksparse_flashattn(batch, heads, seq_len, dim, downsample_len, is_causal) @T.macro def Rescale( - acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), - scores_scale: T.FragmentBuffer([block_M], accum_dtype), + acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), + scores_scale: T.FragmentBuffer([block_M], accum_dtype), ): for i, j in T.Parallel(block_M, dim): acc_o[i, j] *= scores_scale[i] @T.prim_func def blocksparse_flashattn( - Q: T.Tensor(shape, dtype), - K: T.Tensor(shape, dtype), - V: T.Tensor(shape, dtype), - BlockSparseMask: T.Tensor(block_mask_shape, block_mask_dtype), - Output: T.Tensor(shape, dtype), + Q: T.Tensor(shape, dtype), + K: T.Tensor(shape, dtype), + V: T.Tensor(shape, dtype), + BlockSparseMask: T.Tensor(block_mask_shape, block_mask_dtype), + Output: T.Tensor(shape, dtype), ): - with T.Kernel( - T.ceildiv(seq_len, block_M), heads, batch, threads=threads) as (bx, by, bz): + with T.Kernel(T.ceildiv(seq_len, block_M), heads, batch, threads=threads) as (bx, by, bz): Q_shared = T.alloc_shared([block_M, dim], dtype) K_shared = T.alloc_shared([block_N, dim], dtype) V_shared = T.alloc_shared([block_N, dim], dtype) @@ -143,7 +139,7 @@ def blocksparse_flashattn(batch, heads, seq_len, dim, downsample_len, is_causal) logsum = T.alloc_fragment([block_M], accum_dtype) block_mask = T.alloc_local([downsample_len], block_mask_dtype) - T.copy(Q[bz, by, bx * block_M:(bx + 1) * block_M, :], Q_shared) + T.copy(Q[bz, by, bx * block_M : (bx + 1) * block_M, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) @@ -152,20 +148,19 @@ def blocksparse_flashattn(batch, heads, seq_len, dim, downsample_len, is_causal) block_mask[vj] = BlockSparseMask[bz, by, bx, vj] loop_range = ( - T.min(T.ceildiv(seq_len, block_N), T.ceildiv( - (bx + 1) * block_M, block_N)) if is_causal 
else T.ceildiv(seq_len, block_N)) + T.min(T.ceildiv(seq_len, block_N), T.ceildiv((bx + 1) * block_M, block_N)) if is_causal else T.ceildiv(seq_len, block_N) + ) for k in T.Pipelined(loop_range, num_stages=num_stages): if block_mask[k] != 0: MMA0(K, Q_shared, K_shared, acc_s, k, bx, by, bz) - Softmax(acc_s, acc_s_cast, scores_max, scores_max_prev, scores_scale, - scores_sum, logsum) + Softmax(acc_s, acc_s_cast, scores_max, scores_max_prev, scores_scale, scores_sum, logsum) Rescale(acc_o, scores_scale) MMA1(V, V_shared, acc_s_cast, acc_o, k, by, bz) for i, j in T.Parallel(block_M, dim): acc_o[i, j] /= logsum[i] T.copy(acc_o, O_shared) - T.copy(O_shared, Output[bz, by, bx * block_M:(bx + 1) * block_M, :]) + T.copy(O_shared, Output[bz, by, bx * block_M : (bx + 1) * block_M, :]) return blocksparse_flashattn @@ -180,18 +175,16 @@ def test_topk_sparse_attention(): torch.manual_seed(0) # Create inputs - q = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.float16) - k = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.float16) - v = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.float16) + q = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device="cuda", dtype=torch.float16) + k = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device="cuda", dtype=torch.float16) + v = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device="cuda", dtype=torch.float16) sm_scale = 1.0 / (D_HEAD**0.5) # Create sparse mask (downsampled to block level) downsample_factor = BLOCK downsample_len = math.ceil(SEQ_LEN / downsample_factor) - x_ds = torch.randn([BATCH, N_HEADS, downsample_len, downsample_len], - device='cuda', - dtype=torch.bfloat16) + x_ds = torch.randn([BATCH, N_HEADS, downsample_len, downsample_len], device="cuda", dtype=torch.bfloat16) x_ds[:, :, :, 0] = 100 block_mask = get_sparse_attn_mask_from_topk(x_ds, topk=TOPK) @@ -202,15 +195,15 @@ def test_topk_sparse_attention(): # Compute reference # Expand block mask to full attention matrix - full_mask = torch.kron(block_mask.float(), torch.ones(BLOCK, BLOCK, device='cuda')) + full_mask = torch.kron(block_mask.float(), torch.ones(BLOCK, BLOCK, device="cuda")) full_mask = full_mask[..., :SEQ_LEN, :SEQ_LEN].bool() full_mask = full_mask & torch.tril(torch.ones_like(full_mask)) # Apply causal # PyTorch reference implementation - attn = torch.einsum('bhsd,bhtd->bhst', q, k) * sm_scale - attn = attn.masked_fill(~full_mask, float('-inf')) + attn = torch.einsum("bhsd,bhtd->bhst", q, k) * sm_scale + attn = attn.masked_fill(~full_mask, float("-inf")) attn = F.softmax(attn, dim=-1) - ref_output = torch.einsum('bhst,bhtd->bhsd', attn, v) + ref_output = torch.einsum("bhst,bhtd->bhsd", attn, v) print("ref_output", ref_output) print("tilelang_output", tilelang_output) diff --git a/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_paged.py b/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_paged.py index e299821620234fc989702c98797e3b9ec9411a14..77a29ebe284ef7df8265687bca1217166475739d 100644 --- a/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_paged.py +++ b/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_paged.py @@ -13,17 +13,20 @@ from heuristic import num_splits_heuristic def flashattn(batch, heads, heads_kv, dim, dim_v): - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) - dtype = "float16" - accum_dtype = "float" + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) + dtype = T.float16 + accum_dtype = T.float32 kv_group_num = heads // 
heads_kv @tilelang.jit( - out_idx=[-1], pass_configs={ + out_idx=[-1], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) - def kernel_func(block_N, block_H, page_block_size, num_split, num_stages, threads, num_pages, - max_num_blocks_per_seq, max_selected_blocks): + }, + ) + def kernel_func( + block_N, block_H, page_block_size, num_split, num_stages, threads, num_pages, max_num_blocks_per_seq, max_selected_blocks + ): shape_q = [batch, heads, dim] shape_k = [num_pages, page_block_size, heads_kv, dim] shape_v = [num_pages, page_block_size, heads_kv, dim_v] @@ -37,17 +40,16 @@ def flashattn(batch, heads, heads_kv, dim, dim_v): @T.macro def flash_attn_split( - Q: T.Tensor(shape_q, dtype), - K: T.Tensor(shape_k, dtype), - V: T.Tensor(shape_v, dtype), - block_indices: T.Tensor(shape_indices, "int32"), - cache_seqlens: T.Tensor([batch], "int32"), - block_table: T.Tensor(shape_block_table, "int32"), - glse: T.Tensor([batch, heads, num_split], accum_dtype), - Output_partial: T.Tensor(part_shape, accum_dtype), + Q: T.Tensor(shape_q, dtype), + K: T.Tensor(shape_k, dtype), + V: T.Tensor(shape_v, dtype), + block_indices: T.Tensor(shape_indices, T.int32), + cache_seqlens: T.Tensor([batch], T.int32), + block_table: T.Tensor(shape_block_table, T.int32), + glse: T.Tensor([batch, heads, num_split], accum_dtype), + Output_partial: T.Tensor(part_shape, accum_dtype), ): - with T.Kernel( - batch, heads // valid_block_H, num_split, threads=threads) as (bx, by, bz): + with T.Kernel(batch, heads // valid_block_H, num_split, threads=threads) as (bx, by, bz): Q_shared = T.alloc_shared([block_H, dim], dtype) K_shared = T.alloc_shared([block_N, dim], dtype) V_shared = T.alloc_shared([block_N, dim_v], dtype) @@ -67,7 +69,7 @@ def flashattn(batch, heads, heads_kv, dim, dim_v): sid = bz cur_kv_head = hid // (kv_group_num // valid_block_H) - T.copy(Q[bid, hid * valid_block_H:hid * valid_block_H + block_H, :], Q_shared) + T.copy(Q[bid, hid * valid_block_H : hid * valid_block_H + block_H, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) @@ -75,7 +77,7 @@ def flashattn(batch, heads, heads_kv, dim, dim_v): num_blocks = max_selected_blocks blocks_per_split = T.floordiv(num_blocks, num_split) remaining_blocks = T.floormod(num_blocks, num_split) - loop_range = (blocks_per_split + T.if_then_else(sid < remaining_blocks, 1, 0)) + loop_range = blocks_per_split + T.if_then_else(sid < remaining_blocks, 1, 0) start = blocks_per_split * sid + T.min(sid, remaining_blocks) has_valid_block = False for k in T.Pipelined(loop_range, num_stages=num_stages): @@ -85,30 +87,20 @@ def flashattn(batch, heads, heads_kv, dim, dim_v): block_table_idx = T.floordiv(logical_block_idx, block_ratio) block_tile_idx = T.floormod(logical_block_idx, block_ratio) physical_block_idx = block_table[bid, block_table_idx] - T.copy( - K[physical_block_idx, - block_tile_idx * block_N:(block_tile_idx + 1) * block_N, - cur_kv_head, :], K_shared) + T.copy(K[physical_block_idx, block_tile_idx * block_N : (block_tile_idx + 1) * block_N, cur_kv_head, :], K_shared) T.clear(acc_s) - T.gemm( - Q_shared, - K_shared, - acc_s, - transpose_B=True, - policy=T.GemmWarpPolicy.FullRow) + T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) if k == 0: # assume block_indices is sorted in reverse order, otherwise, remove this if condition for i, j in T.Parallel(block_H, block_N): acc_s[i, j] = T.if_then_else( - logical_block_idx * block_N + j >= cache_seqlens[bid], - 
-T.infinity(accum_dtype), acc_s[i, j]) + logical_block_idx * block_N + j >= cache_seqlens[bid], -T.infinity(accum_dtype), acc_s[i, j] + ) T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, scores_max, dim=1, clear=False) for i in T.Parallel(block_H): - scores_max[i] = T.if_then_else(scores_max[i] > scores_max_prev[i], - scores_max[i], scores_max_prev[i]) - scores_scale[i] = T.exp2(scores_max_prev[i] * scale - - scores_max[i] * scale) + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) + scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_H, block_N): acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) T.reduce_sum(acc_s, scores_sum, dim=1) @@ -117,10 +109,7 @@ def flashattn(batch, heads, heads_kv, dim, dim_v): T.copy(acc_s, acc_s_cast) for i, j in T.Parallel(block_H, dim_v): acc_o[i, j] *= scores_scale[i] - T.copy( - V[physical_block_idx, - block_tile_idx * block_N:(block_tile_idx + 1) * block_N, - cur_kv_head, :], V_shared) + T.copy(V[physical_block_idx, block_tile_idx * block_N : (block_tile_idx + 1) * block_N, cur_kv_head, :], V_shared) T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) if has_valid_block: for i, j in T.Parallel(block_H, dim_v): @@ -139,9 +128,9 @@ def flashattn(batch, heads, heads_kv, dim, dim_v): @T.macro def combine( - glse: T.Tensor([batch, heads, num_split], accum_dtype), - Output_partial: T.Tensor(part_shape, accum_dtype), - Output: T.Tensor(shape_o, dtype), + glse: T.Tensor([batch, heads, num_split], accum_dtype), + Output_partial: T.Tensor(part_shape, accum_dtype), + Output: T.Tensor(shape_o, dtype), ): with T.Kernel(heads, batch, threads=128) as (by, bz): po_local = T.alloc_fragment([dim_v], accum_dtype) @@ -150,19 +139,20 @@ def flashattn(batch, heads, heads_kv, dim, dim_v): lse_logsum_local = T.alloc_local([1], accum_dtype) lse_max_local = T.alloc_local([1], accum_dtype) scale_local = T.alloc_local([1], accum_dtype) - max_split = T.alloc_local([1], "int32") + max_split = T.alloc_local([1], T.int32) - T.annotate_layout({ - lse_logsum_local: - T.Fragment(lse_logsum_local.shape, forward_thread_fn=lambda i: i), - }) + T.annotate_layout( + { + lse_logsum_local: T.Fragment(lse_logsum_local.shape, forward_thread_fn=lambda i: i), + } + ) T.clear(lse_logsum_local) T.clear(o_accum_local) lse_max_local[0] = -T.infinity(accum_dtype) for k in T.serial(num_split): lse_local_split[0] = glse[bz, by, k] - if (lse_local_split[0] != 0): + if lse_local_split[0] != 0: max_split[0] = k lse_max_local[0] = T.max(lse_max_local[0], glse[bz, by, k]) @@ -184,18 +174,17 @@ def flashattn(batch, heads, heads_kv, dim, dim_v): @T.prim_func def main( - Q: T.Tensor(shape_q, dtype), - K: T.Tensor(shape_k, dtype), - V: T.Tensor(shape_v, dtype), - block_indices: T.Tensor(shape_indices, "int32"), - cache_seqlens: T.Tensor([batch], "int32"), - block_table: T.Tensor(shape_block_table, "int32"), - glse: T.Tensor([batch, heads, num_split], accum_dtype), - Output_partial: T.Tensor(part_shape, accum_dtype), - Output: T.Tensor(shape_o, dtype), + Q: T.Tensor(shape_q, dtype), + K: T.Tensor(shape_k, dtype), + V: T.Tensor(shape_v, dtype), + block_indices: T.Tensor(shape_indices, T.int32), + cache_seqlens: T.Tensor([batch], T.int32), + block_table: T.Tensor(shape_block_table, T.int32), + glse: T.Tensor([batch, heads, num_split], accum_dtype), + Output_partial: T.Tensor(part_shape, accum_dtype), + Output: T.Tensor(shape_o, dtype), ): - flash_attn_split(Q, K, V, 
block_indices, cache_seqlens, block_table, glse, - Output_partial) + flash_attn_split(Q, K, V, block_indices, cache_seqlens, block_table, glse, Output_partial) combine(glse, Output_partial, Output) return main @@ -204,7 +193,6 @@ def flashattn(batch, heads, heads_kv, dim, dim_v): class SparseFlashAttn(torch.nn.Module): - def __init__(self, batch, heads, heads_kv, dim, dim_v, page_block_size, block_N, num_pages): super(SparseFlashAttn, self).__init__() self.batch = batch @@ -250,18 +238,11 @@ class SparseFlashAttn(torch.nn.Module): num_sm = self.num_sm num_split = num_splits_heuristic( - total_mblocks, - num_sm, - num_n_blocks, - num_m_blocks, - size_one_kv_head, - is_causal_or_local=True, - max_splits=128) - - glse = torch.empty((batch, heads, num_split), dtype=torch.float32, device='cuda') - output_partial = torch.empty((batch, heads, num_split, dim_v), - dtype=torch.float32, - device='cuda') + total_mblocks, num_sm, num_n_blocks, num_m_blocks, size_one_kv_head, is_causal_or_local=True, max_splits=128 + ) + + glse = torch.empty((batch, heads, num_split), dtype=torch.float32, device="cuda") + output_partial = torch.empty((batch, heads, num_split, dim_v), dtype=torch.float32, device="cuda") output = self.kernel( query, @@ -276,14 +257,13 @@ class SparseFlashAttn(torch.nn.Module): return output -def ref_program_torch_paged(query, key_cache, value_cache, block_indices, cache_seqlens, - block_table, page_block_size, block_size): +def ref_program_torch_paged(query, key_cache, value_cache, block_indices, cache_seqlens, block_table, page_block_size, block_size): """ Paged version of sparse attention reference implementation. - + Args: query: [batch, heads, dim] - key_cache: [num_pages, page_block_size, heads_kv, dim] + key_cache: [num_pages, page_block_size, heads_kv, dim] value_cache: [num_pages, page_block_size, heads_kv, dim] block_indices: [batch, heads_kv, max_selected_blocks] - logical block indices cache_seqlens: [batch] - actual sequence lengths @@ -299,12 +279,8 @@ def ref_program_torch_paged(query, key_cache, value_cache, block_indices, cache_ # Reconstruct the full key and value tensors from paged cache max_cache_seqlen = max(cache_seqlens).item() - key_full = torch.zeros((batch, heads_kv, max_cache_seqlen, dim), - dtype=key_cache.dtype, - device=key_cache.device) - value_full = torch.zeros((batch, heads_kv, max_cache_seqlen, dim_v), - dtype=value_cache.dtype, - device=value_cache.device) + key_full = torch.zeros((batch, heads_kv, max_cache_seqlen, dim), dtype=key_cache.dtype, device=key_cache.device) + value_full = torch.zeros((batch, heads_kv, max_cache_seqlen, dim_v), dtype=value_cache.dtype, device=value_cache.device) # Reconstruct full tensors from paged cache using block_table for b in range(batch): @@ -320,20 +296,14 @@ def ref_program_torch_paged(query, key_cache, value_cache, block_indices, cache_ actual_block_size = end_token - start_token # Copy from paged cache to full tensors - key_full[b, :, start_token:end_token, :] = key_cache[ - physical_block_idx, :actual_block_size, :, :].transpose(0, 1) - value_full[b, :, start_token:end_token, :] = value_cache[ - physical_block_idx, :actual_block_size, :, :].transpose(0, 1) + key_full[b, :, start_token:end_token, :] = key_cache[physical_block_idx, :actual_block_size, :, :].transpose(0, 1) + value_full[b, :, start_token:end_token, :] = value_cache[physical_block_idx, :actual_block_size, :, :].transpose(0, 1) # Reshape query for grouped attention - query = rearrange( - query, 'b (h g) d -> b g h d', - g=num_head_groups) # 
[batch_size, num_head_groups, heads_kv, dim] + query = rearrange(query, "b (h g) d -> b g h d", g=num_head_groups) # [batch_size, num_head_groups, heads_kv, dim] # Compute attention scores - scores = einsum( - query, key_full, - 'b g h d, b h s d -> b g h s') # [batch_size, num_head_groups, heads_kv, seqlen_kv] + scores = einsum(query, key_full, "b g h d, b h s d -> b g h s") # [batch_size, num_head_groups, heads_kv, seqlen_kv] # Create sparse mask based on block_indices sparse_mask = torch.zeros_like(scores) @@ -349,24 +319,23 @@ def ref_program_torch_paged(query, key_cache, value_cache, block_indices, cache_ sparse_mask[b, :, h, start_pos:end_pos] = 1 # Apply sparse mask - scores = scores.masked_fill(sparse_mask == 0, float('-inf')) + scores = scores.masked_fill(sparse_mask == 0, float("-inf")) # Apply causal mask based on actual sequence lengths range_len = torch.arange(scores.shape[-1], device=scores.device).unsqueeze(0) cache_seqlens_expanded = cache_seqlens.unsqueeze(1) pad_mask = range_len >= cache_seqlens_expanded pad_mask = pad_mask[:, None, None, :] - scores = scores.masked_fill(pad_mask, float('-inf')) + scores = scores.masked_fill(pad_mask, float("-inf")) # Compute attention weights attention = F.softmax(scores / scale, dim=-1) # Apply attention to values - out = einsum(attention, value_full, - 'b g h s, b h s d -> b g h d') # [batch_size, num_head_groups, heads_kv, dim] + out = einsum(attention, value_full, "b g h s, b h s d -> b g h d") # [batch_size, num_head_groups, heads_kv, dim] # Reshape output back to original format - out = rearrange(out, 'b g h d -> b (h g) d') # [batch_size, heads, dim] + out = rearrange(out, "b g h d -> b (h g) d") # [batch_size, heads, dim] return out @@ -374,17 +343,23 @@ def ref_program_torch_paged(query, key_cache, value_cache, block_indices, cache_ def ref_program_fa(query, kcache, vcache, cache_seqlens, block_table): # latency reference # from flash_attn_interface import flash_attn_with_kvcache # fa3 - from flash_attn import flash_attn_with_kvcache #fa2 + from flash_attn import flash_attn_with_kvcache # fa2 + query = query.unsqueeze(1) - output = flash_attn_with_kvcache( - query, kcache, vcache, cache_seqlens=cache_seqlens, block_table=block_table) + output = flash_attn_with_kvcache(query, kcache, vcache, cache_seqlens=cache_seqlens, block_table=block_table) output = output.squeeze(1) return output def main(args): - - batch, heads, heads_kv, max_cache_seqlen, dim, dim_v = args.batch, args.heads, args.heads_kv, args.max_cache_seqlen, args.dim, args.dim_v + batch, heads, heads_kv, max_cache_seqlen, dim, dim_v = ( + args.batch, + args.heads, + args.heads_kv, + args.max_cache_seqlen, + args.dim, + args.dim_v, + ) sparse_ratio = args.sparse_ratio block_N = args.block_N page_block_size = args.page_block_size @@ -396,35 +371,30 @@ def main(args): dtype = torch.float16 # Generate random inputs - Q = torch.randn((batch, heads, dim), dtype=dtype, device='cuda') - cache_seqlens = torch.randint( - max_cache_seqlen // 2, max_cache_seqlen + 1, (batch,), dtype=torch.int32, device='cuda') + Q = torch.randn((batch, heads, dim), dtype=dtype, device="cuda") + cache_seqlens = torch.randint(max_cache_seqlen // 2, max_cache_seqlen + 1, (batch,), dtype=torch.int32, device="cuda") print("cache_seqlens: ", cache_seqlens) - K = torch.randn((batch, max_cache_seqlen, heads_kv, dim), dtype=dtype, device='cuda') - V = torch.randn((batch, max_cache_seqlen, heads_kv, dim_v), dtype=dtype, device='cuda') + K = torch.randn((batch, max_cache_seqlen, heads_kv, dim), dtype=dtype, 
device="cuda") + V = torch.randn((batch, max_cache_seqlen, heads_kv, dim_v), dtype=dtype, device="cuda") # Create paged KV cache - K_cache = torch.zeros((num_blocks, page_block_size, heads_kv, dim), dtype=dtype, device='cuda') - V_cache = torch.zeros((num_blocks, page_block_size, heads_kv, dim_v), - dtype=dtype, - device='cuda') + K_cache = torch.zeros((num_blocks, page_block_size, heads_kv, dim), dtype=dtype, device="cuda") + V_cache = torch.zeros((num_blocks, page_block_size, heads_kv, dim_v), dtype=dtype, device="cuda") # Create block table and block indices for dense case (all blocks selected) max_num_blocks_per_seq = int(math.ceil(max_cache_seqlen / page_block_size)) print("max_num_blocks_per_seq: ", max_num_blocks_per_seq) - block_table = torch.zeros((batch, max_num_blocks_per_seq), dtype=torch.int32, device='cuda') - block_indices = torch.zeros((batch, heads_kv, max_selected_blocks), - dtype=torch.int32, - device='cuda') + block_table = torch.zeros((batch, max_num_blocks_per_seq), dtype=torch.int32, device="cuda") + block_indices = torch.zeros((batch, heads_kv, max_selected_blocks), dtype=torch.int32, device="cuda") # Fill block table and block indices and cache # Create a pool of available physical blocks - total_blocks_needed = sum( - int(math.ceil(cache_seqlens[seq_idx].item() / page_block_size)) for seq_idx in range(batch)) + total_blocks_needed = sum(int(math.ceil(cache_seqlens[seq_idx].item() / page_block_size)) for seq_idx in range(batch)) available_blocks = list(range(total_blocks_needed)) import random + random.seed(42) # For reproducibility random.shuffle(available_blocks) @@ -459,10 +429,8 @@ def main(args): actual_block_size = end_token - start_token # Copy K and V data to the paged cache - K_cache[physical_block_idx, :actual_block_size, :, :] = K[seq_idx, - start_token:end_token, :, :] - V_cache[physical_block_idx, :actual_block_size, :, :] = V[seq_idx, - start_token:end_token, :, :] + K_cache[physical_block_idx, :actual_block_size, :, :] = K[seq_idx, start_token:end_token, :, :] + V_cache[physical_block_idx, :actual_block_size, :, :] = V[seq_idx, start_token:end_token, :, :] # Fill block_indices for sparse attention # For dense case (verification), we select all blocks in reverse order @@ -497,10 +465,9 @@ def main(args): remaining_blocks = [b for b in all_blocks if b not in selected_blocks] if remaining_blocks: import random + random.seed(42) # For reproducibility - additional_blocks = random.sample( - remaining_blocks, - min(num_selected - recent_blocks, len(remaining_blocks))) + additional_blocks = random.sample(remaining_blocks, min(num_selected - recent_blocks, len(remaining_blocks))) selected_blocks.extend(additional_blocks) # Sort selected blocks in reverse order (most recent first) @@ -513,25 +480,20 @@ def main(args): block_indices[seq_idx, head_idx, i] = -1 # Initialize sparse attention module - sparse_attn = SparseFlashAttn(batch, heads, heads_kv, dim, dim_v, page_block_size, block_N, - num_blocks) - output_sparse = sparse_attn.forward(Q, K_cache, V_cache, block_indices, cache_seqlens, - block_table) + sparse_attn = SparseFlashAttn(batch, heads, heads_kv, dim, dim_v, page_block_size, block_N, num_blocks) + output_sparse = sparse_attn.forward(Q, K_cache, V_cache, block_indices, cache_seqlens, block_table) import flash_attn # noqa: F401 - output_ref_torch = ref_program_torch_paged(Q, K_cache, V_cache, block_indices, cache_seqlens, - block_table, page_block_size, block_N) + output_ref_torch = ref_program_torch_paged(Q, K_cache, V_cache, block_indices, 
cache_seqlens, block_table, page_block_size, block_N) output_ref_fa = ref_program_fa(Q, K_cache, V_cache, cache_seqlens, block_table) # Check correctness if sparse_ratio == 0.0: max_diff = torch.max(torch.abs(output_sparse - output_ref_fa)).item() mean_diff = torch.mean(torch.abs(output_sparse - output_ref_fa)).item() - assert torch.allclose( - output_ref_fa, output_ref_torch, atol=1e-2), "Reference outputs do not match!" + assert torch.allclose(output_ref_fa, output_ref_torch, atol=1e-2), "Reference outputs do not match!" else: - max_diff = torch.max(torch.abs(output_sparse - output_ref_torch)).item() mean_diff = torch.mean(torch.abs(output_sparse - output_ref_torch)).item() @@ -575,16 +537,15 @@ def main(args): if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=8, help='batch size') - parser.add_argument('--heads', type=int, default=32, help='heads') - parser.add_argument('--heads_kv', type=int, default=8, help='heads_kv') - parser.add_argument( - '--max_cache_seqlen', type=int, default=8192, help='kvcache sequence length') - parser.add_argument('--dim', type=int, default=128, help='dim') - parser.add_argument('--dim_v', type=int, default=128, help='dim_v') - parser.add_argument('--sparse_ratio', type=float, default=0.0, help='sparse ratio') - parser.add_argument('--block_N', type=int, default=64, help='block_N') - parser.add_argument('--page_block_size', type=int, default=256, help='block size of pages') - parser.add_argument('--num_pages', type=int, default=1024, help='total number of pages') + parser.add_argument("--batch", type=int, default=8, help="batch size") + parser.add_argument("--heads", type=int, default=32, help="heads") + parser.add_argument("--heads_kv", type=int, default=8, help="heads_kv") + parser.add_argument("--max_cache_seqlen", type=int, default=8192, help="kvcache sequence length") + parser.add_argument("--dim", type=int, default=128, help="dim") + parser.add_argument("--dim_v", type=int, default=128, help="dim_v") + parser.add_argument("--sparse_ratio", type=float, default=0.0, help="sparse ratio") + parser.add_argument("--block_N", type=int, default=64, help="block_N") + parser.add_argument("--page_block_size", type=int, default=256, help="block size of pages") + parser.add_argument("--num_pages", type=int, default=1024, help="total number of pages") args = parser.parse_args() main(args) diff --git a/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_varlen_indice.py b/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_varlen_indice.py index ae300426749dbd41f6edb9c4e34d2ad60b49e3f4..257f41543c3fc2f9d4e044d4ef9a4283edf01142 100644 --- a/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_varlen_indice.py +++ b/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_varlen_indice.py @@ -10,17 +10,18 @@ from heuristic import num_splits_heuristic def flashattn(batch, heads, heads_kv, dim, dim_v): - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) - dtype = "float16" - accum_dtype = "float" + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) + dtype = T.float16 + accum_dtype = T.float32 kv_group_num = heads // heads_kv @tilelang.jit( - out_idx=[-1], pass_configs={ + out_idx=[-1], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) - def kernel_func(block_N, block_H, num_split, num_stages, threads, max_cache_seqlen, - max_selected_blocks): + }, + ) + def kernel_func(block_N, block_H, num_split, num_stages, threads, max_cache_seqlen, 
max_selected_blocks): shape_q = [batch, heads, dim] shape_k = [batch, max_cache_seqlen, heads_kv, dim] shape_v = [batch, max_cache_seqlen, heads_kv, dim_v] @@ -31,17 +32,16 @@ def flashattn(batch, heads, heads_kv, dim, dim_v): @T.macro def flash_attn_split( - Q: T.Tensor(shape_q, dtype), - K: T.Tensor(shape_k, dtype), - V: T.Tensor(shape_v, dtype), - block_indices: T.Tensor(shape_indices, "int32"), - cache_seqlens: T.Tensor([batch], "int32"), - # actual_num_blocks: T.Tensor([batch], "int32"), - glse: T.Tensor([batch, heads, num_split], accum_dtype), - Output_partial: T.Tensor(part_shape, accum_dtype), + Q: T.Tensor(shape_q, dtype), + K: T.Tensor(shape_k, dtype), + V: T.Tensor(shape_v, dtype), + block_indices: T.Tensor(shape_indices, T.int32), + cache_seqlens: T.Tensor([batch], T.int32), + # actual_num_blocks: T.Tensor([batch], T.int32), + glse: T.Tensor([batch, heads, num_split], accum_dtype), + Output_partial: T.Tensor(part_shape, accum_dtype), ): - with T.Kernel( - batch, heads // valid_block_H, num_split, threads=threads) as (bx, by, bz): + with T.Kernel(batch, heads // valid_block_H, num_split, threads=threads) as (bx, by, bz): Q_shared = T.alloc_shared([block_H, dim], dtype) K_shared = T.alloc_shared([block_N, dim], dtype) V_shared = T.alloc_shared([block_N, dim_v], dtype) @@ -62,7 +62,7 @@ def flashattn(batch, heads, heads_kv, dim, dim_v): sid = bz cur_kv_head = hid // (kv_group_num // valid_block_H) - T.copy(Q[bid, hid * valid_block_H:hid * valid_block_H + block_H, :], Q_shared) + T.copy(Q[bid, hid * valid_block_H : hid * valid_block_H + block_H, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) @@ -70,7 +70,7 @@ def flashattn(batch, heads, heads_kv, dim, dim_v): num_blocks = max_selected_blocks blocks_per_split = T.floordiv(num_blocks, num_split) remaining_blocks = T.floormod(num_blocks, num_split) - loop_range = (blocks_per_split + T.if_then_else(sid < remaining_blocks, 1, 0)) + loop_range = blocks_per_split + T.if_then_else(sid < remaining_blocks, 1, 0) start = blocks_per_split * sid + T.min(sid, remaining_blocks) has_valid_block = False @@ -78,27 +78,18 @@ def flashattn(batch, heads, heads_kv, dim, dim_v): i_s = block_indices[bid, cur_kv_head, start + k] if i_s >= 0: has_valid_block = True - T.copy(K[bid, i_s * block_N:(i_s + 1) * block_N, cur_kv_head, :], K_shared) + T.copy(K[bid, i_s * block_N : (i_s + 1) * block_N, cur_kv_head, :], K_shared) T.clear(acc_s) - T.gemm( - Q_shared, - K_shared, - acc_s, - transpose_B=True, - policy=T.GemmWarpPolicy.FullRow) + T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) if k == 0: # assume block_indices is sorted in reverse order, otherwise, remove this if condition for i, j in T.Parallel(block_H, block_N): - acc_s[i, - j] = T.if_then_else(i_s * block_N + j >= cache_seqlens[bid], - -T.infinity(accum_dtype), acc_s[i, j]) + acc_s[i, j] = T.if_then_else(i_s * block_N + j >= cache_seqlens[bid], -T.infinity(accum_dtype), acc_s[i, j]) T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, scores_max, dim=1, clear=False) for i in T.Parallel(block_H): - scores_max[i] = T.if_then_else(scores_max[i] > scores_max_prev[i], - scores_max[i], scores_max_prev[i]) - scores_scale[i] = T.exp2(scores_max_prev[i] * scale - - scores_max[i] * scale) + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) + scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_H, block_N): acc_s[i, j] = 
T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) T.reduce_sum(acc_s, scores_sum, dim=1) @@ -107,7 +98,7 @@ def flashattn(batch, heads, heads_kv, dim, dim_v): T.copy(acc_s, acc_s_cast) for i, j in T.Parallel(block_H, dim_v): acc_o[i, j] *= scores_scale[i] - T.copy(V[bid, i_s * block_N:(i_s + 1) * block_N, cur_kv_head, :], V_shared) + T.copy(V[bid, i_s * block_N : (i_s + 1) * block_N, cur_kv_head, :], V_shared) T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) if has_valid_block: for i, j in T.Parallel(block_H, dim_v): @@ -126,9 +117,9 @@ def flashattn(batch, heads, heads_kv, dim, dim_v): @T.macro def combine( - glse: T.Tensor([batch, heads, num_split], accum_dtype), - Output_partial: T.Tensor(part_shape, accum_dtype), - Output: T.Tensor(shape_o, dtype), + glse: T.Tensor([batch, heads, num_split], accum_dtype), + Output_partial: T.Tensor(part_shape, accum_dtype), + Output: T.Tensor(shape_o, dtype), ): with T.Kernel(heads, batch, threads=128) as (by, bz): po_local = T.alloc_fragment([dim_v], accum_dtype) @@ -137,19 +128,20 @@ def flashattn(batch, heads, heads_kv, dim, dim_v): lse_logsum_local = T.alloc_local([1], accum_dtype) lse_max_local = T.alloc_local([1], accum_dtype) scale_local = T.alloc_local([1], accum_dtype) - max_split = T.alloc_local([1], "int32") + max_split = T.alloc_local([1], T.int32) - T.annotate_layout({ - lse_logsum_local: - T.Fragment(lse_logsum_local.shape, forward_thread_fn=lambda i: i), - }) + T.annotate_layout( + { + lse_logsum_local: T.Fragment(lse_logsum_local.shape, forward_thread_fn=lambda i: i), + } + ) T.clear(lse_logsum_local) T.clear(o_accum_local) lse_max_local[0] = -T.infinity(accum_dtype) for k in T.serial(num_split): lse_local_split[0] = glse[bz, by, k] - if (lse_local_split[0] != 0): + if lse_local_split[0] != 0: max_split[0] = k lse_max_local[0] = T.max(lse_max_local[0], glse[bz, by, k]) @@ -171,15 +163,15 @@ def flashattn(batch, heads, heads_kv, dim, dim_v): @T.prim_func def main( - Q: T.Tensor(shape_q, dtype), - K: T.Tensor(shape_k, dtype), - V: T.Tensor(shape_v, dtype), - block_indices: T.Tensor(shape_indices, "int32"), - cache_seqlens: T.Tensor([batch], "int32"), - # actual_num_blocks: T.Tensor([batch], "int32"), - glse: T.Tensor([batch, heads, num_split], accum_dtype), - Output_partial: T.Tensor(part_shape, accum_dtype), - Output: T.Tensor(shape_o, dtype), + Q: T.Tensor(shape_q, dtype), + K: T.Tensor(shape_k, dtype), + V: T.Tensor(shape_v, dtype), + block_indices: T.Tensor(shape_indices, T.int32), + cache_seqlens: T.Tensor([batch], T.int32), + # actual_num_blocks: T.Tensor([batch], T.int32), + glse: T.Tensor([batch, heads, num_split], accum_dtype), + Output_partial: T.Tensor(part_shape, accum_dtype), + Output: T.Tensor(shape_o, dtype), ): # flash_attn_split(Q, K, V, block_indices, cache_seqlens, actual_num_blocks, glse, Output_partial) flash_attn_split(Q, K, V, block_indices, cache_seqlens, glse, Output_partial) @@ -191,7 +183,6 @@ def flashattn(batch, heads, heads_kv, dim, dim_v): class SparseFlashAttn(torch.nn.Module): - def __init__(self, batch, heads, heads_kv, dim, dim_v, block_size): super(SparseFlashAttn, self).__init__() self.batch = batch @@ -210,7 +201,8 @@ class SparseFlashAttn(torch.nn.Module): num_stages=2, threads=128, max_cache_seqlen=T.dynamic("max_cache_seqlen"), - max_selected_blocks=T.dynamic("max_selected_blocks")) + max_selected_blocks=T.dynamic("max_selected_blocks"), + ) props = torch.cuda.get_device_properties(torch.device("cuda:0")) self.num_sm = props.multi_processor_count @@ -233,25 +225,17 @@ class 
SparseFlashAttn(torch.nn.Module): num_sm = self.num_sm num_split = num_splits_heuristic( - total_mblocks, - num_sm, - num_n_blocks, - num_m_blocks, - size_one_kv_head, - is_causal_or_local=True, - max_splits=128) - - glse = torch.empty((batch, heads, num_split), dtype=torch.float32, device='cuda') - output_partial = torch.empty((batch, heads, num_split, dim_v), - dtype=torch.float32, - device='cuda') + total_mblocks, num_sm, num_n_blocks, num_m_blocks, size_one_kv_head, is_causal_or_local=True, max_splits=128 + ) + + glse = torch.empty((batch, heads, num_split), dtype=torch.float32, device="cuda") + output_partial = torch.empty((batch, heads, num_split, dim_v), dtype=torch.float32, device="cuda") output = self.kernel(query, key, value, block_indices, cache_seqlens, glse, output_partial) return output -def sparse_gqa_decode_varlen_indice(query, key, value, block_indices, cache_seqlens, - max_cache_seqlen, block_size): +def sparse_gqa_decode_varlen_indice(query, key, value, block_indices, cache_seqlens, max_cache_seqlen, block_size): """ Args: query: [batch, heads, dim] @@ -273,31 +257,24 @@ def sparse_gqa_decode_varlen_indice(query, key, value, block_indices, cache_seql block_H = 64 actual_num_blocks = torch.sum(block_indices != -1, dim=-1).to(torch.int32) - actual_num_blocks = actual_num_blocks[:, - 0] #[batch], number of valid blocks, assume all groups in the same batch have the same number of blocks + actual_num_blocks = actual_num_blocks[ + :, 0 + ] # [batch], number of valid blocks, assume all groups in the same batch have the same number of blocks # get num_split num_m_blocks = 1 * (heads // heads_kv + block_H - 1) // block_H - num_n_blocks = max_selected_blocks #(kv_seqlen + block_size - 1 ) // block_size + num_n_blocks = max_selected_blocks # (kv_seqlen + block_size - 1 ) // block_size # num_n_blocks = torch.sum(actual_num_blocks, dim=-1).item() * heads_kv # total number of blocks - size_one_kv_head = max_selected_blocks * block_size * ( - dim + dim_v) * 2 #kv_seqlen * (dim + dim_v) * 2 + size_one_kv_head = max_selected_blocks * block_size * (dim + dim_v) * 2 # kv_seqlen * (dim + dim_v) * 2 total_mblocks = batch * heads_kv * num_m_blocks num_sm = 132 num_split = num_splits_heuristic( - total_mblocks, - num_sm, - num_n_blocks, - num_m_blocks, - size_one_kv_head, - is_causal_or_local=True, - max_splits=128) - - glse = torch.empty((batch, heads, num_split), dtype=torch.float32, device='cuda') - Output_partial = torch.empty((batch, heads, num_split, dim_v), - dtype=torch.float32, - device='cuda') + total_mblocks, num_sm, num_n_blocks, num_m_blocks, size_one_kv_head, is_causal_or_local=True, max_splits=128 + ) + + glse = torch.empty((batch, heads, num_split), dtype=torch.float32, device="cuda") + Output_partial = torch.empty((batch, heads, num_split, dim_v), dtype=torch.float32, device="cuda") kernel = flashattn(batch, heads, heads_kv, dim, dim_v)( block_N=block_size, block_H=block_H, @@ -305,29 +282,24 @@ def sparse_gqa_decode_varlen_indice(query, key, value, block_indices, cache_seql num_stages=2, threads=128, max_cache_seqlen=T.dynamic("max_cache_seqlen"), - max_selected_blocks=T.dynamic("max_selected_blocks")) + max_selected_blocks=T.dynamic("max_selected_blocks"), + ) output = kernel(query, key, value, block_indices, cache_seqlens, glse, Output_partial) return output -def ref_program_torch(query, key, value, block_indices, cache_seqlens, max_cache_seqlen, num_blocks, - block_size): - +def ref_program_torch(query, key, value, block_indices, cache_seqlens, max_cache_seqlen, 
num_blocks, block_size): batch, heads, dim = query.shape heads_kv = key.shape[2] num_head_groups = query.shape[1] // key.shape[2] scale = dim**0.5 - key = rearrange(key, 'b n h d -> b h n d') # [batch_size, heads_kv, seqlen_kv, dim] - value = rearrange(value, 'b n h d -> b h n d') # [batch_size, heads_kv, seqlen_kv, dim] + key = rearrange(key, "b n h d -> b h n d") # [batch_size, heads_kv, seqlen_kv, dim] + value = rearrange(value, "b n h d -> b h n d") # [batch_size, heads_kv, seqlen_kv, dim] - query = rearrange( - query, 'b (h g) d -> b g h d', - g=num_head_groups) # [batch_size, num_head_groups, heads_kv, dim] + query = rearrange(query, "b (h g) d -> b g h d", g=num_head_groups) # [batch_size, num_head_groups, heads_kv, dim] - scores = einsum( - query, key, - 'b g h d, b h s d -> b g h s') # [batch_size, num_head_groups, heads_kv, seqlen_kv] + scores = einsum(query, key, "b g h d, b h s d -> b g h s") # [batch_size, num_head_groups, heads_kv, seqlen_kv] sparse_mask = torch.zeros_like(scores) # Assign mask values based on block_indices @@ -336,28 +308,26 @@ def ref_program_torch(query, key, value, block_indices, cache_seqlens, max_cache valid_indices = block_indices[b, h] # Extract indices for this batch and head for idx in valid_indices: if idx >= 0: - sparse_mask[b, :, h, idx * block_size:(idx + 1) * block_size] = 1 - scores = scores.masked_fill(sparse_mask == 0, float('-inf')) + sparse_mask[b, :, h, idx * block_size : (idx + 1) * block_size] = 1 + scores = scores.masked_fill(sparse_mask == 0, float("-inf")) - range_len = torch.arange(scores.shape[-1], device='cuda').unsqueeze(0) + range_len = torch.arange(scores.shape[-1], device="cuda").unsqueeze(0) cache_seqlens_expanded = cache_seqlens.unsqueeze(1) pad_mask = range_len >= cache_seqlens_expanded pad_mask = pad_mask[:, None, None, :] - scores = scores.masked_fill(pad_mask, float('-inf')) - attention = F.softmax( - scores / scale, dim=-1) # [batch_size, num_head_groups, heads_kv, seqlen_kv] + scores = scores.masked_fill(pad_mask, float("-inf")) + attention = F.softmax(scores / scale, dim=-1) # [batch_size, num_head_groups, heads_kv, seqlen_kv] - out = einsum(attention, value, - 'b g h s, b h s d -> b g h d') # [batch_size, num_head_groups, heads_kv, dim] - out = rearrange(out, 'b g h d -> b (h g) d') # [batch_size, heads, dim] + out = einsum(attention, value, "b g h s, b h s d -> b g h d") # [batch_size, num_head_groups, heads_kv, dim] + out = rearrange(out, "b g h d -> b (h g) d") # [batch_size, heads, dim] return out -def ref_program_fa(query, key, value, block_indices, cache_seqlens, max_cache_seqlen, num_blocks, - block_size): +def ref_program_fa(query, key, value, block_indices, cache_seqlens, max_cache_seqlen, num_blocks, block_size): # latency reference # from flash_attn_interface import flash_attn_with_kvcache # fa3 - from flash_attn import flash_attn_with_kvcache #fa2 + from flash_attn import flash_attn_with_kvcache # fa2 + query = query.unsqueeze(1) output = flash_attn_with_kvcache(query, key, value, cache_seqlens=cache_seqlens) output = output.squeeze(1) @@ -369,23 +339,13 @@ def debug(name, expect, actual, atol=1e-3, rtol=1e-3): print(name + " all_close={}".format(all_close)) if not all_close: diff = (expect - actual).abs() - print("all_close={}, max={}, min={}, mean={}".format(all_close, - diff.max().item(), - diff.min().item(), - diff.mean().item())) + print("all_close={}, max={}, min={}, mean={}".format(all_close, diff.max().item(), diff.min().item(), diff.mean().item())) max_indices = torch.nonzero(diff == 
diff.max().item()) first_index = tuple(max_indices[0].tolist()) print(f"Index: {first_index}, expect: {expect[first_index]}, actual: {actual[first_index]}") -def main(batch=8, - heads=32, - heads_kv=8, - max_cache_seqlen=8192, - dim=128, - dim_v=128, - sparse_ratio=0.8, - block_size=32): +def main(batch=8, heads=32, heads_kv=8, max_cache_seqlen=8192, dim=128, dim_v=128, sparse_ratio=0.8, block_size=32): batch, heads, heads_kv, max_cache_seqlen, dim, dim_v = batch, heads, heads_kv, max_cache_seqlen, dim, dim_v sparse_ratio = sparse_ratio block_size = block_size @@ -393,10 +353,10 @@ def main(batch=8, print("max_selected_blocks: ", max_selected_blocks) dtype = torch.float16 - Q = torch.randn((batch, heads, dim), dtype=dtype, device='cuda') - K = torch.randn((batch, max_cache_seqlen, heads_kv, dim), dtype=dtype, device='cuda') - V = torch.randn((batch, max_cache_seqlen, heads_kv, dim_v), dtype=dtype, device='cuda') - cache_seqlens = torch.randint(1, max_cache_seqlen, (batch,), dtype=torch.int32, device='cuda') + Q = torch.randn((batch, heads, dim), dtype=dtype, device="cuda") + K = torch.randn((batch, max_cache_seqlen, heads_kv, dim), dtype=dtype, device="cuda") + V = torch.randn((batch, max_cache_seqlen, heads_kv, dim_v), dtype=dtype, device="cuda") + cache_seqlens = torch.randint(1, max_cache_seqlen, (batch,), dtype=torch.int32, device="cuda") # cache_seqlens = torch.full((batch,), max_cache_seqlen, dtype=torch.int32, device='cuda') # # Ensure at least one element equals cache_seqlen # random_index = torch.randint(0, batch, (1,), device='cuda').item() # Select a random index @@ -407,10 +367,7 @@ def main(batch=8, max_valid_num_blocks = torch.ceil(cache_seqlens / block_size).int() print("max_valid_num_blocks: ", max_valid_num_blocks) # Initialize block_indices with -1 (for padding blocks) - block_indices = torch.full((batch, heads_kv, max_selected_blocks), - -1, - dtype=torch.int32, - device='cuda') + block_indices = torch.full((batch, heads_kv, max_selected_blocks), -1, dtype=torch.int32, device="cuda") # max_num_blocks = int((max_cache_seqlen + block_size - 1)/ block_size) # block_indices = torch.full((batch, heads_kv, max_num_blocks), -1, dtype=torch.int32, device='cuda') @@ -419,10 +376,9 @@ def main(batch=8, max_valid_block = max_valid_num_blocks[b].item() # Max valid blocks for this batch if max_valid_block > 0: # Ensure there's at least one valid block for h in range(heads_kv): - valid_indices = torch.randperm( - max_valid_block, device='cuda', dtype=torch.int32)[:max_selected_blocks] + valid_indices = torch.randperm(max_valid_block, device="cuda", dtype=torch.int32)[:max_selected_blocks] # valid_indices = torch.randperm(max_valid_block, device='cuda', dtype=torch.int32)[:max_num_blocks] - block_indices[b, h, :len(valid_indices)] = valid_indices + block_indices[b, h, : len(valid_indices)] = valid_indices # Sort indices within each batch-group for consistency block_indices, _ = block_indices.sort(dim=-1, descending=True) @@ -435,8 +391,7 @@ def main(batch=8, print("max_num_blocks: ", max_num_blocks) # parity reference - ref = ref_program_torch(Q, K, V, block_indices, cache_seqlens, max_cache_seqlen, max_num_blocks, - block_size) + ref = ref_program_torch(Q, K, V, block_indices, cache_seqlens, max_cache_seqlen, max_num_blocks, block_size) sparse_kernel = SparseFlashAttn(batch, heads, heads_kv, dim, dim_v, block_size) out = sparse_kernel(Q, K, V, block_indices, cache_seqlens) @@ -446,13 +401,11 @@ def main(batch=8, ## latency reference for _ in range(10): - ref = ref_program_fa(Q, K, V, 
block_indices, cache_seqlens, max_cache_seqlen, - max_num_blocks, block_size) + ref = ref_program_fa(Q, K, V, block_indices, cache_seqlens, max_cache_seqlen, max_num_blocks, block_size) torch.cuda.synchronize() start = time.time() for _ in range(100): - ref = ref_program_fa(Q, K, V, block_indices, cache_seqlens, max_cache_seqlen, - max_num_blocks, block_size) + ref = ref_program_fa(Q, K, V, block_indices, cache_seqlens, max_cache_seqlen, max_num_blocks, block_size) torch.cuda.synchronize() print("dense time: ", (time.time() - start) / 100 * 1000) @@ -470,15 +423,13 @@ def main(batch=8, if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=8, help='batch size') - parser.add_argument('--heads', type=int, default=32, help='heads') - parser.add_argument('--heads_kv', type=int, default=8, help='heads_kv') - parser.add_argument( - '--max_cache_seqlen', type=int, default=8192, help='kvcache sequence length') - parser.add_argument('--dim', type=int, default=128, help='dim') - parser.add_argument('--dim_v', type=int, default=128, help='dim_v') - parser.add_argument('--sparse_ratio', type=float, default=0.8, help='sparse ratio') - parser.add_argument('--block_size', type=int, default=32, help='block_size') + parser.add_argument("--batch", type=int, default=8, help="batch size") + parser.add_argument("--heads", type=int, default=32, help="heads") + parser.add_argument("--heads_kv", type=int, default=8, help="heads_kv") + parser.add_argument("--max_cache_seqlen", type=int, default=8192, help="kvcache sequence length") + parser.add_argument("--dim", type=int, default=128, help="dim") + parser.add_argument("--dim_v", type=int, default=128, help="dim_v") + parser.add_argument("--sparse_ratio", type=float, default=0.8, help="sparse ratio") + parser.add_argument("--block_size", type=int, default=32, help="block_size") args = parser.parse_args() - main(args.batch, args.heads, args.heads_kv, args.max_cache_seqlen, args.dim, args.dim_v, - args.sparse_ratio, args.block_size) + main(args.batch, args.heads, args.heads_kv, args.max_cache_seqlen, args.dim, args.dim_v, args.sparse_ratio, args.block_size) diff --git a/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_varlen_mask.py b/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_varlen_mask.py index ad62817dd5bce0fbc6ebe0e45191c57355c48661..2957f8c970986e4f5f48673a7026677a10dc2b17 100644 --- a/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_varlen_mask.py +++ b/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_varlen_mask.py @@ -12,15 +12,17 @@ from heuristic import num_splits_heuristic def flashattn(batch, heads, heads_kv, dim, dim_v): - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) - dtype = "float16" - accum_dtype = "float" + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) + dtype = T.float16 + accum_dtype = T.float32 kv_group_num = heads // heads_kv @tilelang.jit( - out_idx=[-1], pass_configs={ + out_idx=[-1], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, + ) def kernel_func(block_N, block_H, num_split, num_stages, threads, max_cache_seqlen, num_blocks): shape_q = [batch, heads, dim] shape_k = [batch, max_cache_seqlen, heads_kv, dim] @@ -32,16 +34,15 @@ def flashattn(batch, heads, heads_kv, dim, dim_v): @T.macro def flash_attn_split( - Q: T.Tensor(shape_q, dtype), - K: T.Tensor(shape_k, dtype), - V: T.Tensor(shape_v, dtype), - block_mask: T.Tensor(shape_mask, "bool"), - cache_seqlens: 
T.Tensor([batch], "int32"), - glse: T.Tensor([batch, heads, num_split], accum_dtype), - Output_partial: T.Tensor(part_shape, accum_dtype), + Q: T.Tensor(shape_q, dtype), + K: T.Tensor(shape_k, dtype), + V: T.Tensor(shape_v, dtype), + block_mask: T.Tensor(shape_mask, T.bool), + cache_seqlens: T.Tensor([batch], T.int32), + glse: T.Tensor([batch, heads, num_split], accum_dtype), + Output_partial: T.Tensor(part_shape, accum_dtype), ): - with T.Kernel( - batch, heads // valid_block_H, num_split, threads=threads) as (bx, by, bz): + with T.Kernel(batch, heads // valid_block_H, num_split, threads=threads) as (bx, by, bz): Q_shared = T.alloc_shared([block_H, dim], dtype) K_shared = T.alloc_shared([block_N, dim], dtype) V_shared = T.alloc_shared([block_N, dim_v], dtype) @@ -62,38 +63,31 @@ def flashattn(batch, heads, heads_kv, dim, dim_v): sid = bz cur_kv_head = hid // (kv_group_num // valid_block_H) - T.copy(Q[bid, hid * valid_block_H:hid * valid_block_H + block_H, :], Q_shared) + T.copy(Q[bid, hid * valid_block_H : hid * valid_block_H + block_H, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) blocks_per_split = T.floordiv(num_blocks, num_split) remaining_blocks = T.floormod(num_blocks, num_split) - loop_range = (blocks_per_split + T.if_then_else(sid < remaining_blocks, 1, 0)) + loop_range = blocks_per_split + T.if_then_else(sid < remaining_blocks, 1, 0) start = blocks_per_split * sid + T.min(sid, remaining_blocks) has_valid_block = False for k in T.Pipelined(loop_range, num_stages=num_stages): if block_mask[bid, hid, start + k]: has_valid_block = True - T.copy( - K[bid, (start + k) * block_N:(start + k + 1) * block_N, cur_kv_head, :], - K_shared) + T.copy(K[bid, (start + k) * block_N : (start + k + 1) * block_N, cur_kv_head, :], K_shared) T.clear(acc_s) - T.gemm( - Q_shared, - K_shared, - acc_s, - transpose_B=True, - policy=T.GemmWarpPolicy.FullRow) + T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) for i, j in T.Parallel(block_H, block_N): - acc_s[i, j] = T.if_then_else((start + k) * block_N + j - >= cache_seqlens[bx], - -T.infinity(accum_dtype), acc_s[i, j]) + acc_s[i, j] = T.if_then_else( + (start + k) * block_N + j >= cache_seqlens[bx], -T.infinity(accum_dtype), acc_s[i, j] + ) T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, scores_max, dim=1, clear=False) for i in T.Parallel(block_H): - scores_scale[i] = T.exp2(scores_max_prev[i] * scale - - scores_max[i] * scale) + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) + scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_H, block_N): acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) T.reduce_sum(acc_s, scores_sum, dim=1) @@ -102,9 +96,7 @@ def flashattn(batch, heads, heads_kv, dim, dim_v): T.copy(acc_s, acc_s_cast) for i, j in T.Parallel(block_H, dim_v): acc_o[i, j] *= scores_scale[i] - T.copy( - V[bid, (start + k) * block_N:(start + k + 1) * block_N, cur_kv_head, :], - V_shared) + T.copy(V[bid, (start + k) * block_N : (start + k + 1) * block_N, cur_kv_head, :], V_shared) T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) if has_valid_block: for i, j in T.Parallel(block_H, dim_v): @@ -122,9 +114,9 @@ def flashattn(batch, heads, heads_kv, dim, dim_v): @T.macro def combine( - glse: T.Tensor([batch, heads, num_split], accum_dtype), - Output_partial: T.Tensor(part_shape, accum_dtype), - Output: T.Tensor(shape_o, dtype), + 
glse: T.Tensor([batch, heads, num_split], accum_dtype), + Output_partial: T.Tensor(part_shape, accum_dtype), + Output: T.Tensor(shape_o, dtype), ): with T.Kernel(heads, batch, threads=128) as (by, bz): po_local = T.alloc_fragment([dim_v], accum_dtype) @@ -134,10 +126,11 @@ def flashattn(batch, heads, heads_kv, dim, dim_v): lse_max_local = T.alloc_local([1], accum_dtype) scale_local = T.alloc_local([1], accum_dtype) - T.annotate_layout({ - lse_logsum_local: - T.Fragment(lse_logsum_local.shape, forward_thread_fn=lambda i: i), - }) + T.annotate_layout( + { + lse_logsum_local: T.Fragment(lse_logsum_local.shape, forward_thread_fn=lambda i: i), + } + ) T.clear(lse_logsum_local) T.clear(o_accum_local) @@ -160,14 +153,14 @@ def flashattn(batch, heads, heads_kv, dim, dim_v): @T.prim_func def main( - Q: T.Tensor(shape_q, dtype), - K: T.Tensor(shape_k, dtype), - V: T.Tensor(shape_v, dtype), - block_mask: T.Tensor(shape_mask, "bool"), - cache_seqlens: T.Tensor([batch], "int32"), - glse: T.Tensor([batch, heads, num_split], accum_dtype), - Output_partial: T.Tensor(part_shape, accum_dtype), - Output: T.Tensor(shape_o, dtype), + Q: T.Tensor(shape_q, dtype), + K: T.Tensor(shape_k, dtype), + V: T.Tensor(shape_v, dtype), + block_mask: T.Tensor(shape_mask, T.bool), + cache_seqlens: T.Tensor([batch], T.int32), + glse: T.Tensor([batch, heads, num_split], accum_dtype), + Output_partial: T.Tensor(part_shape, accum_dtype), + Output: T.Tensor(shape_o, dtype), ): flash_attn_split(Q, K, V, block_mask, cache_seqlens, glse, Output_partial) combine(glse, Output_partial, Output) @@ -178,7 +171,6 @@ def flashattn(batch, heads, heads_kv, dim, dim_v): class SparseFlashAttn(torch.nn.Module): - def __init__(self, batch, heads, heads_kv, dim, dim_v, block_size): super(SparseFlashAttn, self).__init__() self.batch = batch @@ -197,7 +189,8 @@ class SparseFlashAttn(torch.nn.Module): num_stages=2, threads=128, max_cache_seqlen=T.dynamic("max_cache_seqlen"), - num_blocks=T.dynamic("num_blocks")) + num_blocks=T.dynamic("num_blocks"), + ) props = torch.cuda.get_device_properties(torch.device("cuda:0")) self.num_sm = props.multi_processor_count @@ -216,24 +209,16 @@ class SparseFlashAttn(torch.nn.Module): num_m_blocks = 1 * (heads // heads_kv + block_H - 1) // block_H num_n_blocks = max_selected_blocks - size_one_kv_head = max_selected_blocks * block_size * ( - dim + dim_v) * 2 #kv_seqlen * (dim + dim_v) * 2 + size_one_kv_head = max_selected_blocks * block_size * (dim + dim_v) * 2 # kv_seqlen * (dim + dim_v) * 2 total_mblocks = batch * heads_kv * num_m_blocks # num_sm = 132 num_sm = self.num_sm num_split = num_splits_heuristic( - total_mblocks, - num_sm, - num_n_blocks, - num_m_blocks, - size_one_kv_head, - is_causal_or_local=True, - max_splits=128) + total_mblocks, num_sm, num_n_blocks, num_m_blocks, size_one_kv_head, is_causal_or_local=True, max_splits=128 + ) # print("num_split: ", num_split) - glse = torch.empty((batch, heads, num_split), dtype=torch.float32, device='cuda') - Output_partial = torch.empty((batch, heads, num_split, dim_v), - dtype=torch.float32, - device='cuda') + glse = torch.empty((batch, heads, num_split), dtype=torch.float32, device="cuda") + Output_partial = torch.empty((batch, heads, num_split, dim_v), dtype=torch.float32, device="cuda") output = self.kernel(query, key, value, block_mask, cache_seqlens, glse, Output_partial) return output @@ -258,26 +243,21 @@ def sparse_gqa_decode_varlen_mask(query, key, value, block_mask, cache_seqlens, block_H = 64 actual_num_blocks = torch.sum(block_mask, 
dim=-1).to(torch.int32) - actual_num_blocks = actual_num_blocks[:, - 0] #[batch], number of valid blocks, assume all groups in the same batch have the same number of blocks + actual_num_blocks = actual_num_blocks[ + :, 0 + ] # [batch], number of valid blocks, assume all groups in the same batch have the same number of blocks max_selected_blocks = actual_num_blocks.max().item() # get num_split num_m_blocks = 1 * (heads // heads_kv + block_H - 1) // block_H - num_n_blocks = max_selected_blocks #(kv_seqlen + block_size - 1 ) // block_size + num_n_blocks = max_selected_blocks # (kv_seqlen + block_size - 1 ) // block_size # num_n_blocks = torch.sum(actual_num_blocks, dim=-1).item() * heads_kv # total number of blocks - size_one_kv_head = max_selected_blocks * block_size * ( - dim + dim_v) * 2 #kv_seqlen * (dim + dim_v) * 2 + size_one_kv_head = max_selected_blocks * block_size * (dim + dim_v) * 2 # kv_seqlen * (dim + dim_v) * 2 total_mblocks = batch * heads_kv * num_m_blocks num_sm = 132 num_split = num_splits_heuristic( - total_mblocks, - num_sm, - num_n_blocks, - num_m_blocks, - size_one_kv_head, - is_causal_or_local=True, - max_splits=128) + total_mblocks, num_sm, num_n_blocks, num_m_blocks, size_one_kv_head, is_causal_or_local=True, max_splits=128 + ) kernel = flashattn(batch, heads, heads_kv, dim, dim_v)( block_N=block_size, @@ -286,11 +266,10 @@ def sparse_gqa_decode_varlen_mask(query, key, value, block_mask, cache_seqlens, num_stages=2, threads=128, max_cache_seqlen=T.dynamic("max_cache_seqlen"), - num_blocks=T.dynamic("num_blocks")) - glse = torch.empty((batch, heads, num_split), dtype=torch.float32, device='cuda') - Output_partial = torch.empty((batch, heads, num_split, dim_v), - dtype=torch.float32, - device='cuda') + num_blocks=T.dynamic("num_blocks"), + ) + glse = torch.empty((batch, heads, num_split), dtype=torch.float32, device="cuda") + Output_partial = torch.empty((batch, heads, num_split, dim_v), dtype=torch.float32, device="cuda") # print(kernel.get_kernel_source()) output = kernel(query, key, value, block_mask, cache_seqlens, glse, Output_partial) @@ -298,24 +277,18 @@ def sparse_gqa_decode_varlen_mask(query, key, value, block_mask, cache_seqlens, return output -def ref_program_torch(query, key, value, block_mask, cache_seqlens, max_cache_seqlen, num_blocks, - block_size): - +def ref_program_torch(query, key, value, block_mask, cache_seqlens, max_cache_seqlen, num_blocks, block_size): batch, heads, dim = query.shape heads_kv = key.shape[2] num_head_groups = query.shape[1] // key.shape[2] scale = dim**0.5 - key = rearrange(key, 'b n h d -> b h n d') # [batch_size, heads_kv, seqlen_kv, dim] - value = rearrange(value, 'b n h d -> b h n d') # [batch_size, heads_kv, seqlen_kv, dim] + key = rearrange(key, "b n h d -> b h n d") # [batch_size, heads_kv, seqlen_kv, dim] + value = rearrange(value, "b n h d -> b h n d") # [batch_size, heads_kv, seqlen_kv, dim] - query = rearrange( - query, 'b (h g) d -> b g h d', - g=num_head_groups) # [batch_size, num_head_groups, heads_kv, dim] + query = rearrange(query, "b (h g) d -> b g h d", g=num_head_groups) # [batch_size, num_head_groups, heads_kv, dim] - scores = einsum( - query, key, - 'b g h d, b h s d -> b g h s') # [batch_size, num_head_groups, heads_kv, seqlen_kv] + scores = einsum(query, key, "b g h d, b h s d -> b g h s") # [batch_size, num_head_groups, heads_kv, seqlen_kv] sparse_mask = torch.zeros_like(scores) # Assign mask values @@ -323,29 +296,27 @@ def ref_program_torch(query, key, value, block_mask, cache_seqlens, max_cache_se for 
h in range(heads_kv): for idx in range(num_blocks): if block_mask[b, h, idx]: - sparse_mask[b, :, h, idx * block_size:(idx + 1) * block_size] = 1 + sparse_mask[b, :, h, idx * block_size : (idx + 1) * block_size] = 1 - scores = scores.masked_fill(sparse_mask == 0, float('-inf')) + scores = scores.masked_fill(sparse_mask == 0, float("-inf")) - range_len = torch.arange(scores.shape[-1], device='cuda').unsqueeze(0) + range_len = torch.arange(scores.shape[-1], device="cuda").unsqueeze(0) cache_seqlens_expanded = cache_seqlens.unsqueeze(1) pad_mask = range_len >= cache_seqlens_expanded pad_mask = pad_mask[:, None, None, :] - scores = scores.masked_fill(pad_mask, float('-inf')) - attention = F.softmax( - scores / scale, dim=-1) # [batch_size, num_head_groups, heads_kv, seqlen_kv] + scores = scores.masked_fill(pad_mask, float("-inf")) + attention = F.softmax(scores / scale, dim=-1) # [batch_size, num_head_groups, heads_kv, seqlen_kv] - out = einsum(attention, value, - 'b g h s, b h s d -> b g h d') # [batch_size, num_head_groups, heads_kv, dim] - out = rearrange(out, 'b g h d -> b (h g) d') # [batch_size, heads, dim] + out = einsum(attention, value, "b g h s, b h s d -> b g h d") # [batch_size, num_head_groups, heads_kv, dim] + out = rearrange(out, "b g h d -> b (h g) d") # [batch_size, heads, dim] return out -def ref_program_fa(query, key, value, block_indices, cache_seqlens, max_cache_seqlen, num_blocks, - block_size): +def ref_program_fa(query, key, value, block_indices, cache_seqlens, max_cache_seqlen, num_blocks, block_size): # latency reference # from flash_attn_interface import flash_attn_with_kvcache # fa3 - from flash_attn import flash_attn_with_kvcache #fa2 + from flash_attn import flash_attn_with_kvcache # fa2 + query = query.unsqueeze(1) output = flash_attn_with_kvcache(query, key, value, cache_seqlens=cache_seqlens) output = output.squeeze(1) @@ -359,23 +330,13 @@ def debug(name, expect, actual, atol=1e-3, rtol=1e-3): # print(expect[3, 28]) # print(actual[3, 28]) diff = (expect - actual).abs() - print("all_close={}, max={}, min={}, mean={}".format(all_close, - diff.max().item(), - diff.min().item(), - diff.mean().item())) + print("all_close={}, max={}, min={}, mean={}".format(all_close, diff.max().item(), diff.min().item(), diff.mean().item())) max_indices = torch.nonzero(diff == diff.max().item()) first_index = tuple(max_indices[0].tolist()) print(f"Index: {first_index}, expect: {expect[first_index]}, actual: {actual[first_index]}") -def main(batch=8, - heads=32, - heads_kv=8, - max_cache_seqlen=8192, - dim=128, - dim_v=128, - sparse_ratio=0.8, - block_size=32): +def main(batch=8, heads=32, heads_kv=8, max_cache_seqlen=8192, dim=128, dim_v=128, sparse_ratio=0.8, block_size=32): batch, heads, heads_kv, max_cache_seqlen, dim, dim_v = batch, heads, heads_kv, max_cache_seqlen, dim, dim_v sparse_ratio = sparse_ratio block_size = block_size @@ -383,14 +344,13 @@ def main(batch=8, print("max_selected_blocks: ", max_selected_blocks) dtype = torch.float16 - Q = torch.randn((batch, heads, dim), dtype=dtype, device='cuda') - K = torch.randn((batch, max_cache_seqlen, heads_kv, dim), dtype=dtype, device='cuda') - V = torch.randn((batch, max_cache_seqlen, heads_kv, dim_v), dtype=dtype, device='cuda') - cache_seqlens = torch.randint(1, max_cache_seqlen, (batch,), dtype=torch.int32, device='cuda') + Q = torch.randn((batch, heads, dim), dtype=dtype, device="cuda") + K = torch.randn((batch, max_cache_seqlen, heads_kv, dim), dtype=dtype, device="cuda") + V = torch.randn((batch, max_cache_seqlen, 
heads_kv, dim_v), dtype=dtype, device="cuda") + cache_seqlens = torch.randint(1, max_cache_seqlen, (batch,), dtype=torch.int32, device="cuda") # Ensure at least one element equals cache_seqlen - random_index = torch.randint(0, batch, (1,), device='cuda').item() # Select a random index - cache_seqlens[ - random_index] = max_cache_seqlen # Assign cache_seqlen to ensure at least one occurrence + random_index = torch.randint(0, batch, (1,), device="cuda").item() # Select a random index + cache_seqlens[random_index] = max_cache_seqlen # Assign cache_seqlen to ensure at least one occurrence # cache_seqlens = torch.full((batch,), max_cache_seqlen, dtype=torch.int32, device='cuda') print("cache_seqlens: ", cache_seqlens) @@ -402,7 +362,7 @@ def main(batch=8, max_valid_num_blocks = torch.ceil(cache_seqlens / block_size).int() print("max_valid_num_blocks: ", max_valid_num_blocks) # Initialize block_mask with false (for padding blocks) - block_mask = torch.zeros((batch, heads_kv, num_blocks), dtype=torch.bool, device='cuda') + block_mask = torch.zeros((batch, heads_kv, num_blocks), dtype=torch.bool, device="cuda") # Assign valid indices while ensuring no duplicates within each batch-group for b in range(batch): @@ -410,13 +370,12 @@ def main(batch=8, valid_num_block = valid_num_blocks[b].item() # Valid blocks for this batch if valid_num_block > 0: # Ensure there's at least one valid block for h in range(heads_kv): - perm = torch.randperm(max_valid_block, device='cuda')[:valid_num_block] + perm = torch.randperm(max_valid_block, device="cuda")[:valid_num_block] block_mask[b, h, perm] = True # print("block_mask: ", block_mask) # parity reference - ref = ref_program_torch(Q, K, V, block_mask, cache_seqlens, max_cache_seqlen, num_blocks, - block_size) + ref = ref_program_torch(Q, K, V, block_mask, cache_seqlens, max_cache_seqlen, num_blocks, block_size) # out = sparse_gqa_decode_varlen_mask(Q, K, V, block_mask, cache_seqlens, block_size) model = SparseFlashAttn(batch, heads, heads_kv, dim, dim_v, block_size) out = model(Q, K, V, block_mask, cache_seqlens) @@ -426,13 +385,11 @@ def main(batch=8, ## latency reference for _ in range(10): - ref = ref_program_fa(Q, K, V, block_mask, cache_seqlens, max_cache_seqlen, num_blocks, - block_size) + ref = ref_program_fa(Q, K, V, block_mask, cache_seqlens, max_cache_seqlen, num_blocks, block_size) torch.cuda.synchronize() start = time.time() for _ in range(100): - ref = ref_program_fa(Q, K, V, block_mask, cache_seqlens, max_cache_seqlen, num_blocks, - block_size) + ref = ref_program_fa(Q, K, V, block_mask, cache_seqlens, max_cache_seqlen, num_blocks, block_size) torch.cuda.synchronize() print("dense time: ", (time.time() - start) / 100 * 1000) @@ -451,15 +408,13 @@ def main(batch=8, if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=8, help='batch size') - parser.add_argument('--heads', type=int, default=32, help='heads') - parser.add_argument('--heads_kv', type=int, default=8, help='heads_kv') - parser.add_argument( - '--max_cache_seqlen', type=int, default=8192, help='kvcache sequence length') - parser.add_argument('--dim', type=int, default=128, help='dim') - parser.add_argument('--dim_v', type=int, default=128, help='dim_v') - parser.add_argument('--sparse_ratio', type=float, default=0.8, help='sparse ratio') - parser.add_argument('--block_size', type=int, default=32, help='block_size') + parser.add_argument("--batch", type=int, default=8, help="batch size") + parser.add_argument("--heads", type=int, 
default=32, help="heads") + parser.add_argument("--heads_kv", type=int, default=8, help="heads_kv") + parser.add_argument("--max_cache_seqlen", type=int, default=8192, help="kvcache sequence length") + parser.add_argument("--dim", type=int, default=128, help="dim") + parser.add_argument("--dim_v", type=int, default=128, help="dim_v") + parser.add_argument("--sparse_ratio", type=float, default=0.8, help="sparse ratio") + parser.add_argument("--block_size", type=int, default=32, help="block_size") args = parser.parse_args() - main(args.batch, args.heads, args.heads_kv, args.max_cache_seqlen, args.dim, args.dim_v, - args.sparse_ratio, args.block_size) + main(args.batch, args.heads, args.heads_kv, args.max_cache_seqlen, args.dim, args.dim_v, args.sparse_ratio, args.block_size) diff --git a/examples/blocksparse_attention/example_triton_sparse_gqa_decode_varlen_indice.py b/examples/blocksparse_attention/example_triton_sparse_gqa_decode_varlen_indice.py index 85b72b775e4b8d038f36931d1fd3b20274b8f07a..b61d52fa092f4d8cd115905d71cde59a99ca88dc 100644 --- a/examples/blocksparse_attention/example_triton_sparse_gqa_decode_varlen_indice.py +++ b/examples/blocksparse_attention/example_triton_sparse_gqa_decode_varlen_indice.py @@ -12,12 +12,8 @@ from heuristic import num_splits_heuristic @triton.autotune( - configs=[ - triton.Config({}, num_warps=num_warps, num_stages=num_stages) - for num_warps in [1, 2, 4]\ - for num_stages in [1, 2, 3, 4, 7] - ], - key=['BLOCK_H', 'BLOCK_N', 'BLOCK_D'], + configs=[triton.Config({}, num_warps=num_warps, num_stages=num_stages) for num_warps in [1, 2, 4] for num_stages in [1, 2, 3, 4, 7]], + key=["BLOCK_H", "BLOCK_N", "BLOCK_D"], ) @triton.jit def _split_kernel( @@ -79,16 +75,11 @@ def _split_kernel( loop_range = blocks_per_split q_ptr += batch_idx * stride_q_b + head_idx_q * stride_q_h - k_cache_ptr += batch_idx * stride_k_b + head_idx_kv * stride_k_h + offs_n[ - None, :] * stride_k_s + offs_d[:, None] * stride_k_d - v_cache_ptr += batch_idx * stride_v_b + head_idx_kv * stride_v_h + offs_n[:, - None] * stride_v_s + offs_d[ - None, :] * stride_v_d + k_cache_ptr += batch_idx * stride_k_b + head_idx_kv * stride_k_h + offs_n[None, :] * stride_k_s + offs_d[:, None] * stride_k_d + v_cache_ptr += batch_idx * stride_v_b + head_idx_kv * stride_v_h + offs_n[:, None] * stride_v_s + offs_d[None, :] * stride_v_d mask_ptr += batch_idx * stride_mask_b + head_idx_kv * stride_mask_h - q = tl.load( - q_ptr + offs_h[:, None] * stride_q_h + offs_d[None, :] * stride_q_d, - mask=offs_h[:, None] < gqa_group_size) + q = tl.load(q_ptr + offs_h[:, None] * stride_q_h + offs_d[None, :] * stride_q_d, mask=offs_h[:, None] < gqa_group_size) start = blocks_per_split * split_idx + tl.minimum(split_idx, remaining_blocks) for i in range(loop_range): block_idx = tl.load(mask_ptr + (start + i) * stride_mask_s) @@ -119,23 +110,18 @@ def _split_kernel( acc = acc * l_recip acc = acc.to(o_partial_ptr.dtype.element_ty) - lse_partial_ptr += batch_idx * stride_lse_b + ( - head_idx_q + offs_h) * stride_lse_h + split_idx * stride_lse_split + lse_partial_ptr += batch_idx * stride_lse_b + (head_idx_q + offs_h) * stride_lse_h + split_idx * stride_lse_split tl.store(lse_partial_ptr, m_i, mask=offs_h < gqa_group_size) - o_partial_ptr += batch_idx * stride_o_b + ( - head_idx_q + - offs_h[:, None]) * stride_o_h + split_idx * stride_o_split + offs_d[None, :] * stride_o_d + o_partial_ptr += ( + batch_idx * stride_o_b + (head_idx_q + offs_h[:, None]) * stride_o_h + split_idx * stride_o_split + offs_d[None, :] * stride_o_d + ) 
tl.store(o_partial_ptr, acc, mask=offs_h[:, None] < gqa_group_size) @triton.autotune( - configs=[ - triton.Config({}, num_warps=num_warps, num_stages=num_stages) - for num_warps in [1, 2, 4]\ - for num_stages in [1, 2, 3, 4, 7] - ], - key=['BLOCK_D'], + configs=[triton.Config({}, num_warps=num_warps, num_stages=num_stages) for num_warps in [1, 2, 4] for num_stages in [1, 2, 3, 4, 7]], + key=["BLOCK_D"], ) @triton.jit def _merge_kernel( @@ -163,18 +149,15 @@ def _merge_kernel( offs_d = tl.arange(0, BLOCK_D) lse_offsets = lse_partial_ptr + batch_idx * lse_partial_stride_b + head_idx * lse_partial_stride_h - lse = tl.load( - lse_offsets + offs_splits * lse_partial_stride_split, - mask=offs_splits < num_splits, - other=float("-inf")) + lse = tl.load(lse_offsets + offs_splits * lse_partial_stride_split, mask=offs_splits < num_splits, other=float("-inf")) lse_max = tl.max(lse) o_offsets = o_partial_ptr + batch_idx * o_partial_stride_b + head_idx * o_partial_stride_h o_partial = tl.load( - o_offsets + offs_splits[:, None] * o_partial_stride_split + - offs_d[None, :] * o_partial_stride_d, - mask=offs_splits[:, None] < num_splits) + o_offsets + offs_splits[:, None] * o_partial_stride_split + offs_d[None, :] * o_partial_stride_d, + mask=offs_splits[:, None] < num_splits, + ) sumexp_normalized_splitk = tl.exp(lse - lse_max) sumexp_normalized = tl.sum(sumexp_normalized_splitk, axis=0) numerator_normalized = tl.sum(o_partial * sumexp_normalized_splitk[:, None], axis=0) @@ -209,19 +192,13 @@ def block_sparse_flash_decode_gqa_indice_triton( num_m_blocks = 1 * (heads // heads_kv + block_H - 1) // block_H num_n_blocks = max_selected_blocks - size_one_kv_head = max_selected_blocks * block_size * ( - dim + dim_v) * 2 #kv_seqlen * (dim + dim_v) * 2 + size_one_kv_head = max_selected_blocks * block_size * (dim + dim_v) * 2 # kv_seqlen * (dim + dim_v) * 2 total_mblocks = batch * heads_kv * num_m_blocks num_sm = 64 # num_sm = self.num_sm num_splits = num_splits_heuristic( - total_mblocks, - num_sm, - num_n_blocks, - num_m_blocks, - size_one_kv_head, - is_causal_or_local=True, - max_splits=128) + total_mblocks, num_sm, num_n_blocks, num_m_blocks, size_one_kv_head, is_causal_or_local=True, max_splits=128 + ) # print("num_splits:", num_splits, "num_blocks:", num_n_blocks) @@ -295,24 +272,18 @@ def block_sparse_flash_decode_gqa_indice_triton( return output -def ref_program_torch(query, key, value, block_indices, cache_seqlens, max_cache_seqlen, num_blocks, - block_size): - +def ref_program_torch(query, key, value, block_indices, cache_seqlens, max_cache_seqlen, num_blocks, block_size): batch, heads, dim = query.shape heads_kv = key.shape[2] dim_v = value.shape[-1] num_head_groups = query.shape[1] // key.shape[2] scale = dim**0.5 - key = rearrange(key, 'b n h d -> b h n d') # [batch_size, heads_kv, seqlen_kv, dim] - value = rearrange(value, 'b n h d -> b h n d') # [batch_size, heads_kv, seqlen_kv, dim] + key = rearrange(key, "b n h d -> b h n d") # [batch_size, heads_kv, seqlen_kv, dim] + value = rearrange(value, "b n h d -> b h n d") # [batch_size, heads_kv, seqlen_kv, dim] - query = rearrange( - query, 'b (h g) d -> b g h d', - g=num_head_groups) # [batch_size, num_head_groups, heads_kv, dim] + query = rearrange(query, "b (h g) d -> b g h d", g=num_head_groups) # [batch_size, num_head_groups, heads_kv, dim] - scores = einsum( - query, key, - 'b g h d, b h s d -> b g h s') # [batch_size, num_head_groups, heads_kv, seqlen_kv] + scores = einsum(query, key, "b g h d, b h s d -> b g h s") # [batch_size, num_head_groups, 
heads_kv, seqlen_kv] sparse_mask = torch.zeros_like(scores) # Assign mask values based on block_indices @@ -321,42 +292,33 @@ def ref_program_torch(query, key, value, block_indices, cache_seqlens, max_cache valid_indices = block_indices[b, h] # Extract indices for this batch and head for idx in valid_indices: if idx >= 0: - sparse_mask[b, :, h, idx * block_size:(idx + 1) * block_size] = 1 - scores = scores.masked_fill(sparse_mask == 0, float('-inf')) + sparse_mask[b, :, h, idx * block_size : (idx + 1) * block_size] = 1 + scores = scores.masked_fill(sparse_mask == 0, float("-inf")) - range_len = torch.arange(scores.shape[-1], device='cuda').unsqueeze(0) + range_len = torch.arange(scores.shape[-1], device="cuda").unsqueeze(0) cache_seqlens_expanded = cache_seqlens.unsqueeze(1) pad_mask = range_len >= cache_seqlens_expanded pad_mask = pad_mask[:, None, None, :] - scores = scores.masked_fill(pad_mask, float('-inf')) - attention = F.softmax( - scores / scale, dim=-1) # [batch_size, num_head_groups, heads_kv, seqlen_kv] + scores = scores.masked_fill(pad_mask, float("-inf")) + attention = F.softmax(scores / scale, dim=-1) # [batch_size, num_head_groups, heads_kv, seqlen_kv] - out = einsum(attention, value, - 'b g h s, b h s d -> b g h d') # [batch_size, num_head_groups, heads_kv, dim] - out = rearrange(out, 'b g h d -> b (h g) d') # [batch_size, heads, dim] + out = einsum(attention, value, "b g h s, b h s d -> b g h d") # [batch_size, num_head_groups, heads_kv, dim] + out = rearrange(out, "b g h d -> b (h g) d") # [batch_size, heads, dim] return out def ref_program_fa(query, key, value, cache_seqlens): # latency reference # from flash_attn_interface import flash_attn_with_kvcache # fa3 - from flash_attn import flash_attn_with_kvcache #fa2 + from flash_attn import flash_attn_with_kvcache # fa2 + query = query.unsqueeze(1) output = flash_attn_with_kvcache(query, key, value, cache_seqlens=cache_seqlens) output = output.squeeze(1) return output -def main(batch=64, - heads=32, - heads_kv=8, - max_cache_seqlen=8192, - dim=128, - dim_v=128, - sparse_ratio=0.8, - block_size=32): - +def main(batch=64, heads=32, heads_kv=8, max_cache_seqlen=8192, dim=128, dim_v=128, sparse_ratio=0.8, block_size=32): batch, heads, heads_kv, max_cache_seqlen, dim, dim_v = batch, heads, heads_kv, max_cache_seqlen, dim, dim_v sparse_ratio = sparse_ratio block_size = block_size @@ -369,34 +331,29 @@ def main(batch=64, dtype = torch.float16 block_H = 64 - Q = torch.randn((batch, heads, dim), dtype=dtype, device='cuda') - K = torch.randn((batch, max_cache_seqlen, heads_kv, dim), dtype=dtype, device='cuda') - V = torch.randn((batch, max_cache_seqlen, heads_kv, dim_v), dtype=dtype, device='cuda') - cache_seqlens = torch.randint(1, max_cache_seqlen, (batch,), dtype=torch.int32, device='cuda') + Q = torch.randn((batch, heads, dim), dtype=dtype, device="cuda") + K = torch.randn((batch, max_cache_seqlen, heads_kv, dim), dtype=dtype, device="cuda") + V = torch.randn((batch, max_cache_seqlen, heads_kv, dim_v), dtype=dtype, device="cuda") + cache_seqlens = torch.randint(1, max_cache_seqlen, (batch,), dtype=torch.int32, device="cuda") # cache_seqlens = torch.full((batch,), max_cache_seqlen, dtype=torch.int32, device='cuda') # Ensure at least one element equals cache_seqlen - random_index = torch.randint(0, batch, (1,), device='cuda').item() # Select a random index - cache_seqlens[ - random_index] = max_cache_seqlen # Assign cache_seqlen to ensure at least one occurrence + random_index = torch.randint(0, batch, (1,), device="cuda").item() # 
Select a random index + cache_seqlens[random_index] = max_cache_seqlen # Assign cache_seqlen to ensure at least one occurrence print("cache_seqlens: ", cache_seqlens) max_valid_num_blocks = torch.ceil(cache_seqlens / block_size).int() print("max_valid_num_blocks: ", max_valid_num_blocks) # Initialize block_indices with -1 (for padding blocks) - block_indices = torch.full((batch, heads_kv, max_selected_blocks), - -1, - dtype=torch.int32, - device='cuda') + block_indices = torch.full((batch, heads_kv, max_selected_blocks), -1, dtype=torch.int32, device="cuda") # Assign valid indices while ensuring no duplicates within each batch-group for b in range(batch): max_valid_block = max_valid_num_blocks[b].item() # Max valid blocks for this batch if max_valid_block > 0: # Ensure there's at least one valid block for h in range(heads_kv): - valid_indices = torch.randperm( - max_valid_block, device='cuda', dtype=torch.int32)[:max_selected_blocks] - block_indices[b, h, :len(valid_indices)] = valid_indices + valid_indices = torch.randperm(max_valid_block, device="cuda", dtype=torch.int32)[:max_selected_blocks] + block_indices[b, h, : len(valid_indices)] = valid_indices # Sort indices within each batch-group for consistency block_indices, _ = block_indices.sort(dim=-1, descending=True) @@ -408,8 +365,7 @@ def main(batch=64, max_num_blocks = torch.max(max_valid_num_blocks).item() print("max_num_blocks: ", max_num_blocks) - ref = ref_program_torch(Q, K, V, block_indices, cache_seqlens, max_cache_seqlen, max_num_blocks, - block_size) + ref = ref_program_torch(Q, K, V, block_indices, cache_seqlens, max_cache_seqlen, max_num_blocks, block_size) triton_out = block_sparse_flash_decode_gqa_indice_triton( Q, @@ -423,8 +379,7 @@ def main(batch=64, ) print("max difference: ", torch.max(torch.abs(ref - triton_out))) - assert torch.allclose( - ref, triton_out, atol=1e-2), "Output mismatch between Triton and reference implementation" + assert torch.allclose(ref, triton_out, atol=1e-2), "Output mismatch between Triton and reference implementation" print("Passed the ref test!") # Measure performance @@ -466,15 +421,13 @@ def main(batch=64, if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=64, help='batch size') - parser.add_argument('--heads', type=int, default=32, help='heads') - parser.add_argument('--heads_kv', type=int, default=8, help='heads_kv') - parser.add_argument( - '--max_cache_seqlen', type=int, default=8192, help='kvcache sequence length') - parser.add_argument('--dim', type=int, default=128, help='dim') - parser.add_argument('--dim_v', type=int, default=128, help='dim_v') - parser.add_argument('--sparse_ratio', type=float, default=0.8, help='sparse ratio') - parser.add_argument('--block_size', type=int, default=32, help='block_size') + parser.add_argument("--batch", type=int, default=64, help="batch size") + parser.add_argument("--heads", type=int, default=32, help="heads") + parser.add_argument("--heads_kv", type=int, default=8, help="heads_kv") + parser.add_argument("--max_cache_seqlen", type=int, default=8192, help="kvcache sequence length") + parser.add_argument("--dim", type=int, default=128, help="dim") + parser.add_argument("--dim_v", type=int, default=128, help="dim_v") + parser.add_argument("--sparse_ratio", type=float, default=0.8, help="sparse ratio") + parser.add_argument("--block_size", type=int, default=32, help="block_size") args = parser.parse_args() - main(args.batch, args.heads, args.heads_kv, args.max_cache_seqlen, args.dim, 
args.dim_v, - args.sparse_ratio, args.block_size) + main(args.batch, args.heads, args.heads_kv, args.max_cache_seqlen, args.dim, args.dim_v, args.sparse_ratio, args.block_size) diff --git a/examples/blocksparse_attention/example_triton_sparse_gqa_decode_varlen_mask.py b/examples/blocksparse_attention/example_triton_sparse_gqa_decode_varlen_mask.py index 348572526583b33679d54f626e1d702dde3507ca..c05b3777952fddc834cc46377a823a4c14e0e999 100644 --- a/examples/blocksparse_attention/example_triton_sparse_gqa_decode_varlen_mask.py +++ b/examples/blocksparse_attention/example_triton_sparse_gqa_decode_varlen_mask.py @@ -11,12 +11,8 @@ from heuristic import num_splits_heuristic @triton.autotune( - configs=[ - triton.Config({}, num_warps=num_warps, num_stages=num_stages) - for num_warps in [1, 2, 4]\ - for num_stages in [1, 2, 3, 4, 7] - ], - key=['BLOCK_H', 'BLOCK_N', 'BLOCK_D'], + configs=[triton.Config({}, num_warps=num_warps, num_stages=num_stages) for num_warps in [1, 2, 4] for num_stages in [1, 2, 3, 4, 7]], + key=["BLOCK_H", "BLOCK_N", "BLOCK_D"], ) @triton.jit def _split_kernel( @@ -77,16 +73,11 @@ def _split_kernel( loop_range = blocks_per_split q_ptr += batch_idx * stride_q_b + head_idx_q * stride_q_h - k_cache_ptr += batch_idx * stride_k_b + head_idx_kv * stride_k_h + offs_n[ - None, :] * stride_k_s + offs_d[:, None] * stride_k_d - v_cache_ptr += batch_idx * stride_v_b + head_idx_kv * stride_v_h + offs_n[:, - None] * stride_v_s + offs_d[ - None, :] * stride_v_d + k_cache_ptr += batch_idx * stride_k_b + head_idx_kv * stride_k_h + offs_n[None, :] * stride_k_s + offs_d[:, None] * stride_k_d + v_cache_ptr += batch_idx * stride_v_b + head_idx_kv * stride_v_h + offs_n[:, None] * stride_v_s + offs_d[None, :] * stride_v_d mask_ptr += batch_idx * stride_mask_b + head_idx_kv * stride_mask_h - q = tl.load( - q_ptr + offs_h[:, None] * stride_q_h + offs_d[None, :] * stride_q_d, - mask=offs_h[:, None] < gqa_group_size) + q = tl.load(q_ptr + offs_h[:, None] * stride_q_h + offs_d[None, :] * stride_q_d, mask=offs_h[:, None] < gqa_group_size) start = blocks_per_split * split_idx + tl.minimum(split_idx, remaining_blocks) for block_idx in range(loop_range): start_n = (start + block_idx) * BLOCK_N @@ -117,23 +108,18 @@ def _split_kernel( acc = acc * l_recip acc = acc.to(o_partial_ptr.dtype.element_ty) - lse_partial_ptr += batch_idx * stride_lse_b + ( - head_idx_q + offs_h) * stride_lse_h + split_idx * stride_lse_split + lse_partial_ptr += batch_idx * stride_lse_b + (head_idx_q + offs_h) * stride_lse_h + split_idx * stride_lse_split tl.store(lse_partial_ptr, m_i, mask=offs_h < gqa_group_size) - o_partial_ptr += batch_idx * stride_o_b + ( - head_idx_q + - offs_h[:, None]) * stride_o_h + split_idx * stride_o_split + offs_d[None, :] * stride_o_d + o_partial_ptr += ( + batch_idx * stride_o_b + (head_idx_q + offs_h[:, None]) * stride_o_h + split_idx * stride_o_split + offs_d[None, :] * stride_o_d + ) tl.store(o_partial_ptr, acc, mask=offs_h[:, None] < gqa_group_size) @triton.autotune( - configs=[ - triton.Config({}, num_warps=num_warps, num_stages=num_stages) - for num_warps in [1, 2, 4]\ - for num_stages in [1, 2, 3, 4, 7] - ], - key=['BLOCK_D'], + configs=[triton.Config({}, num_warps=num_warps, num_stages=num_stages) for num_warps in [1, 2, 4] for num_stages in [1, 2, 3, 4, 7]], + key=["BLOCK_D"], ) @triton.jit def _merge_kernel( @@ -161,18 +147,15 @@ def _merge_kernel( offs_d = tl.arange(0, BLOCK_D) lse_offsets = lse_partial_ptr + batch_idx * lse_partial_stride_b + head_idx * lse_partial_stride_h - lse = 
tl.load( - lse_offsets + offs_splits * lse_partial_stride_split, - mask=offs_splits < num_splits, - other=float("-inf")) + lse = tl.load(lse_offsets + offs_splits * lse_partial_stride_split, mask=offs_splits < num_splits, other=float("-inf")) lse_max = tl.max(lse) o_offsets = o_partial_ptr + batch_idx * o_partial_stride_b + head_idx * o_partial_stride_h o_partial = tl.load( - o_offsets + offs_splits[:, None] * o_partial_stride_split + - offs_d[None, :] * o_partial_stride_d, - mask=offs_splits[:, None] < num_splits) + o_offsets + offs_splits[:, None] * o_partial_stride_split + offs_d[None, :] * o_partial_stride_d, + mask=offs_splits[:, None] < num_splits, + ) sumexp_normalized_splitk = tl.exp(lse - lse_max) sumexp_normalized = tl.sum(sumexp_normalized_splitk, axis=0) numerator_normalized = tl.sum(o_partial * sumexp_normalized_splitk[:, None], axis=0) @@ -207,19 +190,13 @@ def block_sparse_flash_decode_gqa_mask_triton( num_m_blocks = 1 * (heads // heads_kv + block_H - 1) // block_H num_n_blocks = max_selected_blocks - size_one_kv_head = max_selected_blocks * block_size * ( - dim + dim_v) * 2 #kv_seqlen * (dim + dim_v) * 2 + size_one_kv_head = max_selected_blocks * block_size * (dim + dim_v) * 2 # kv_seqlen * (dim + dim_v) * 2 total_mblocks = batch * heads_kv * num_m_blocks num_sm = 64 # num_sm = self.num_sm num_splits = num_splits_heuristic( - total_mblocks, - num_sm, - num_n_blocks, - num_m_blocks, - size_one_kv_head, - is_causal_or_local=True, - max_splits=128) + total_mblocks, num_sm, num_n_blocks, num_m_blocks, size_one_kv_head, is_causal_or_local=True, max_splits=128 + ) # print("num_splits:", num_splits, "num_blocks:", num_n_blocks) @@ -292,24 +269,18 @@ def block_sparse_flash_decode_gqa_mask_triton( return output -def ref_program_torch(query, key, value, block_mask, cache_seqlens, max_cache_seqlen, num_blocks, - block_size): - +def ref_program_torch(query, key, value, block_mask, cache_seqlens, max_cache_seqlen, num_blocks, block_size): batch, heads, dim = query.shape heads_kv = key.shape[2] num_head_groups = query.shape[1] // key.shape[2] scale = dim**0.5 - key = rearrange(key, 'b n h d -> b h n d') # [batch_size, heads_kv, seqlen_kv, dim] - value = rearrange(value, 'b n h d -> b h n d') # [batch_size, heads_kv, seqlen_kv, dim] + key = rearrange(key, "b n h d -> b h n d") # [batch_size, heads_kv, seqlen_kv, dim] + value = rearrange(value, "b n h d -> b h n d") # [batch_size, heads_kv, seqlen_kv, dim] - query = rearrange( - query, 'b (h g) d -> b g h d', - g=num_head_groups) # [batch_size, num_head_groups, heads_kv, dim] + query = rearrange(query, "b (h g) d -> b g h d", g=num_head_groups) # [batch_size, num_head_groups, heads_kv, dim] - scores = einsum( - query, key, - 'b g h d, b h s d -> b g h s') # [batch_size, num_head_groups, heads_kv, seqlen_kv] + scores = einsum(query, key, "b g h d, b h s d -> b g h s") # [batch_size, num_head_groups, heads_kv, seqlen_kv] sparse_mask = torch.zeros_like(scores) # Assign mask values @@ -317,43 +288,34 @@ def ref_program_torch(query, key, value, block_mask, cache_seqlens, max_cache_se for h in range(heads_kv): for idx in range(num_blocks): if block_mask[b, h, idx]: - sparse_mask[b, :, h, idx * block_size:(idx + 1) * block_size] = 1 + sparse_mask[b, :, h, idx * block_size : (idx + 1) * block_size] = 1 - scores = scores.masked_fill(sparse_mask == 0, float('-inf')) + scores = scores.masked_fill(sparse_mask == 0, float("-inf")) - range_len = torch.arange(scores.shape[-1], device='cuda').unsqueeze(0) + range_len = torch.arange(scores.shape[-1], 
device="cuda").unsqueeze(0) cache_seqlens_expanded = cache_seqlens.unsqueeze(1) pad_mask = range_len >= cache_seqlens_expanded pad_mask = pad_mask[:, None, None, :] - scores = scores.masked_fill(pad_mask, float('-inf')) - attention = F.softmax( - scores / scale, dim=-1) # [batch_size, num_head_groups, heads_kv, seqlen_kv] + scores = scores.masked_fill(pad_mask, float("-inf")) + attention = F.softmax(scores / scale, dim=-1) # [batch_size, num_head_groups, heads_kv, seqlen_kv] - out = einsum(attention, value, - 'b g h s, b h s d -> b g h d') # [batch_size, num_head_groups, heads_kv, dim] - out = rearrange(out, 'b g h d -> b (h g) d') # [batch_size, heads, dim] + out = einsum(attention, value, "b g h s, b h s d -> b g h d") # [batch_size, num_head_groups, heads_kv, dim] + out = rearrange(out, "b g h d -> b (h g) d") # [batch_size, heads, dim] return out def ref_program_fa(query, key, value, cache_seqlens): # latency reference # from flash_attn_interface import flash_attn_with_kvcache # fa3 - from flash_attn import flash_attn_with_kvcache #fa2 + from flash_attn import flash_attn_with_kvcache # fa2 + query = query.unsqueeze(1) output = flash_attn_with_kvcache(query, key, value, cache_seqlens=cache_seqlens) output = output.squeeze(1) return output -def main(batch=64, - heads=32, - heads_kv=8, - max_cache_seqlen=8192, - dim=128, - dim_v=128, - sparse_ratio=0.8, - block_size=32): - +def main(batch=64, heads=32, heads_kv=8, max_cache_seqlen=8192, dim=128, dim_v=128, sparse_ratio=0.8, block_size=32): batch, heads, heads_kv, max_cache_seqlen, dim, dim_v = batch, heads, heads_kv, max_cache_seqlen, dim, dim_v block_size = block_size sparse_ratio = sparse_ratio @@ -363,14 +325,13 @@ def main(batch=64, dtype = torch.float16 - Q = torch.randn((batch, heads, dim), dtype=dtype, device='cuda') - K = torch.randn((batch, max_cache_seqlen, heads_kv, dim), dtype=dtype, device='cuda') - V = torch.randn((batch, max_cache_seqlen, heads_kv, dim_v), dtype=dtype, device='cuda') - cache_seqlens = torch.randint(1, max_cache_seqlen, (batch,), dtype=torch.int32, device='cuda') + Q = torch.randn((batch, heads, dim), dtype=dtype, device="cuda") + K = torch.randn((batch, max_cache_seqlen, heads_kv, dim), dtype=dtype, device="cuda") + V = torch.randn((batch, max_cache_seqlen, heads_kv, dim_v), dtype=dtype, device="cuda") + cache_seqlens = torch.randint(1, max_cache_seqlen, (batch,), dtype=torch.int32, device="cuda") # Ensure at least one element equals cache_seqlen - random_index = torch.randint(0, batch, (1,), device='cuda').item() # Select a random index - cache_seqlens[ - random_index] = max_cache_seqlen # Assign cache_seqlen to ensure at least one occurrence + random_index = torch.randint(0, batch, (1,), device="cuda").item() # Select a random index + cache_seqlens[random_index] = max_cache_seqlen # Assign cache_seqlen to ensure at least one occurrence num_blocks = (max_cache_seqlen + block_size - 1) // block_size @@ -379,7 +340,7 @@ def main(batch=64, max_valid_num_blocks = torch.ceil(cache_seqlens / block_size).int() print("max_valid_num_blocks: ", max_valid_num_blocks) # Initialize block_mask with false (for padding blocks) - block_mask = torch.zeros((batch, heads_kv, num_blocks), dtype=torch.bool, device='cuda') + block_mask = torch.zeros((batch, heads_kv, num_blocks), dtype=torch.bool, device="cuda") # Assign valid indices while ensuring no duplicates within each batch-group for b in range(batch): @@ -387,11 +348,10 @@ def main(batch=64, valid_num_block = valid_num_blocks[b].item() # Valid blocks for this batch if 
valid_num_block > 0: # Ensure there's at least one valid block for h in range(heads_kv): - perm = torch.randperm(max_valid_block, device='cuda')[:valid_num_block] + perm = torch.randperm(max_valid_block, device="cuda")[:valid_num_block] block_mask[b, h, perm] = True - ref = ref_program_torch(Q, K, V, block_mask, cache_seqlens, max_cache_seqlen, num_blocks, - block_size) + ref = ref_program_torch(Q, K, V, block_mask, cache_seqlens, max_cache_seqlen, num_blocks, block_size) triton_out = block_sparse_flash_decode_gqa_mask_triton( Q, @@ -404,8 +364,7 @@ def main(batch=64, ) # print("max difference: ", torch.max(torch.abs(ref - triton_out))) - assert torch.allclose( - ref, triton_out, atol=1e-2), "Output mismatch between Triton and reference implementation" + assert torch.allclose(ref, triton_out, atol=1e-2), "Output mismatch between Triton and reference implementation" print("Passed the ref test!") # Measure performance @@ -448,15 +407,13 @@ def main(batch=64, if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=64, help='batch size') - parser.add_argument('--heads', type=int, default=32, help='heads') - parser.add_argument('--heads_kv', type=int, default=8, help='heads_kv') - parser.add_argument( - '--max_cache_seqlen', type=int, default=8192, help='kvcache sequence length') - parser.add_argument('--dim', type=int, default=128, help='dim') - parser.add_argument('--dim_v', type=int, default=128, help='dim_v') - parser.add_argument('--sparse_ratio', type=float, default=0.8, help='sparse ratio') - parser.add_argument('--block_size', type=int, default=32, help='block_size') + parser.add_argument("--batch", type=int, default=64, help="batch size") + parser.add_argument("--heads", type=int, default=32, help="heads") + parser.add_argument("--heads_kv", type=int, default=8, help="heads_kv") + parser.add_argument("--max_cache_seqlen", type=int, default=8192, help="kvcache sequence length") + parser.add_argument("--dim", type=int, default=128, help="dim") + parser.add_argument("--dim_v", type=int, default=128, help="dim_v") + parser.add_argument("--sparse_ratio", type=float, default=0.8, help="sparse ratio") + parser.add_argument("--block_size", type=int, default=32, help="block_size") args = parser.parse_args() - main(args.batch, args.heads, args.heads_kv, args.max_cache_seqlen, args.dim, args.dim_v, - args.sparse_ratio, args.block_size) + main(args.batch, args.heads, args.heads_kv, args.max_cache_seqlen, args.dim, args.dim_v, args.sparse_ratio, args.block_size) diff --git a/examples/blocksparse_attention/heuristic.py b/examples/blocksparse_attention/heuristic.py index b60a81dc353aa908bf74f5a0aada4bf1a178cda1..0e6fc528196e3f111924b7d16b34d0c9af8c3800 100644 --- a/examples/blocksparse_attention/heuristic.py +++ b/examples/blocksparse_attention/heuristic.py @@ -1,8 +1,7 @@ import math -def num_splits_heuristic(total_mblocks, num_SMs, num_n_blocks, num_m_blocks, size_one_kv_head, - is_causal_or_local, max_splits): +def num_splits_heuristic(total_mblocks, num_SMs, num_n_blocks, num_m_blocks, size_one_kv_head, is_causal_or_local, max_splits): """ Determines the optimal number of splits for maximizing GPU occupancy while balancing memory efficiency. 
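
Note (illustrative aside, not part of the patch): the decode examples above all consume this heuristic the same way -- derive the block counts and the per-KV-head footprint, ask num_splits_heuristic for a split count, then size the glse / partial-output buffers from it. A minimal sketch of that call pattern, using the names from the examples and hypothetical sizes:

    import torch
    from heuristic import num_splits_heuristic

    # Hypothetical decode shapes; any consistent values work.
    batch, heads, heads_kv, dim, dim_v = 8, 32, 8, 128, 128
    block_size, block_H, max_selected_blocks, num_sm = 32, 64, 64, 132

    num_m_blocks = (heads // heads_kv + block_H - 1) // block_H  # query-head blocks per KV head
    num_n_blocks = max_selected_blocks  # KV blocks each query block may visit
    size_one_kv_head = max_selected_blocks * block_size * (dim + dim_v) * 2  # bytes of K+V touched per KV head (fp16)
    total_mblocks = batch * heads_kv * num_m_blocks

    num_split = num_splits_heuristic(
        total_mblocks, num_sm, num_n_blocks, num_m_blocks, size_one_kv_head,
        is_causal_or_local=True, max_splits=128,
    )

    # Buffers the split kernel writes and the combine kernel reduces over.
    glse = torch.empty((batch, heads, num_split), dtype=torch.float32, device="cuda")
    output_partial = torch.empty((batch, heads, num_split, dim_v), dtype=torch.float32, device="cuda")
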
diff --git a/examples/blocksparse_attention/test_example_blocksparse_attention.py b/examples/blocksparse_attention/test_example_blocksparse_attention.py index adda1f0f15764393dbc08f138e6f492d7e14c5ec..dd33f46c4ef9705350bc2cc8894cb715d4444346 100644 --- a/examples/blocksparse_attention/test_example_blocksparse_attention.py +++ b/examples/blocksparse_attention/test_example_blocksparse_attention.py @@ -25,26 +25,14 @@ def test_example_tilelang_sparse_gqa_decode_varlen_mask(): def test_example_triton_sparse_gqa_decode_varlen_indice(): example_triton_sparse_gqa_decode_varlen_indice.main( - batch=8, - heads=8, - heads_kv=4, - max_cache_seqlen=2048, - dim=128, - dim_v=128, - sparse_ratio=0.8, - block_size=32) + batch=8, heads=8, heads_kv=4, max_cache_seqlen=2048, dim=128, dim_v=128, sparse_ratio=0.8, block_size=32 + ) def test_example_triton_sparse_gqa_decode_varlen_mask(): example_triton_sparse_gqa_decode_varlen_mask.main( - batch=16, - heads=16, - heads_kv=8, - max_cache_seqlen=1024, - dim=128, - dim_v=128, - sparse_ratio=0.8, - block_size=32) + batch=16, heads=16, heads_kv=8, max_cache_seqlen=1024, dim=128, dim_v=128, sparse_ratio=0.8, block_size=32 + ) if __name__ == "__main__": diff --git a/examples/blocksparse_gemm/example_blocksparse_gemm.py b/examples/blocksparse_gemm/example_blocksparse_gemm.py index 7b9cff7c12bfff7fec6c1d40d97a0a2f42dd7be0..b8a34e45de7c31f4594f76127cea577306c7554e 100644 --- a/examples/blocksparse_gemm/example_blocksparse_gemm.py +++ b/examples/blocksparse_gemm/example_blocksparse_gemm.py @@ -19,8 +19,7 @@ parser.add_argument("--m", type=int, default=1024, help="Matrix dimension M") parser.add_argument("--n", type=int, default=1024, help="Matrix dimension N") parser.add_argument("--k", type=int, default=1024, help="Matrix dimension K") parser.add_argument("--sparsity", type=float, default=0.5, help="Sparsity ratio (0-1)") -parser.add_argument( - "--use_autotune", action="store_true", default=False, help="Whether to use autotune") +parser.add_argument("--use_autotune", action="store_true", default=False, help="Whether to use autotune") args, _ = parser.parse_known_args() M, N, K = args.m, args.n, args.k @@ -41,17 +40,19 @@ def get_configs(): thread_num = [128, 256] enable_rasterization = [True, False] - _configs = list( - itertools.product(block_M, block_N, block_K, num_stages, thread_num, enable_rasterization)) + _configs = list(itertools.product(block_M, block_N, block_K, num_stages, thread_num, enable_rasterization)) - return [{ - "block_M": c[0], - "block_N": c[1], - "block_K": c[2], - "num_stages": c[3], - "thread_num": c[4], - "enable_rasteration": c[5], - } for c in _configs] + return [ + { + "block_M": c[0], + "block_N": c[1], + "block_K": c[2], + "num_stages": c[3], + "thread_num": c[4], + "enable_rasteration": c[5], + } + for c in _configs + ] def ref_program(A, B, BlockMask, block_M, block_N, block_K): @@ -61,12 +62,10 @@ def ref_program(A, B, BlockMask, block_M, block_N, block_K): accu = torch.zeros((block_M, block_N), dtype=torch.float32, device=A.device) for k in range(K // block_K): if BlockMask[i, j, k]: - accu += ( - A[i * block_M:(i + 1) * block_M, k * block_K:(k + 1) * block_K].to( - torch.float32) @ B[k * block_K:(k + 1) * block_K, - j * block_N:(j + 1) * block_N].to(torch.float32)) - ref_c[i * block_M:(i + 1) * block_M, - j * block_N:(j + 1) * block_N] = accu.to(torch.float16) + accu += A[i * block_M : (i + 1) * block_M, k * block_K : (k + 1) * block_K].to(torch.float32) @ B[ + k * block_K : (k + 1) * block_K, j * block_N : (j + 1) * block_N + 
].to(torch.float32) + ref_c[i * block_M : (i + 1) * block_M, j * block_N : (j + 1) * block_N] = accu.to(torch.float16) return ref_c @@ -89,28 +88,21 @@ def supply_program(params: List[KernelParam]): return input_tensors -@tilelang.autotune(configs=get_configs(),) +@tilelang.autotune( + configs=get_configs(), +) @tilelang.jit(out_idx=[-1]) -def blocksparse_matmul(M, - N, - K, - block_M, - block_N, - block_K, - num_stages, - thread_num, - enable_rasteration, - dtype="float16", - accum_dtype="float"): - +def blocksparse_matmul( + M, N, K, block_M, block_N, block_K, num_stages, thread_num, enable_rasteration, dtype=T.float16, accum_dtype=T.float32 +): block_mask_shape = (M // block_M, N // block_N, K // block_K) @T.prim_func def block_sparse_matmul( - A: T.Tensor((M, K), dtype), - B: T.Tensor((K, N), dtype), - BlockMask: T.Tensor(block_mask_shape, "bool"), - C: T.Tensor((M, N), dtype), + A: T.Tensor((M, K), dtype), + B: T.Tensor((K, N), dtype), + BlockMask: T.Tensor(block_mask_shape, "bool"), + C: T.Tensor((M, N), dtype), ): with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=thread_num) as (bx, by): A_shared = T.alloc_shared((block_M, block_K), dtype) @@ -134,7 +126,6 @@ def blocksparse_matmul(M, def main(): - # Initialize input matrices A and B on the GPU with half precision a = torch.randn(M, K).cuda().half() b = torch.randn(K, N).cuda().half() @@ -147,8 +138,7 @@ def main(): best_config = kernel.config best_latency = kernel.latency - block_M, block_N, block_K = best_config["block_M"], best_config["block_N"], best_config[ - "block_K"] + block_M, block_N, block_K = best_config["block_M"], best_config["block_N"], best_config["block_K"] print(f"Best Config: {best_config}") print(f"Sparsity Ratio: {sparsity}") @@ -163,10 +153,10 @@ def main(): block_K=DEFAULT_BLOCK_K, num_stages=DEFAULT_NUM_STAGES, thread_num=DEFAULT_THREAD_NUM, - enable_rasteration=DEFAULT_ENABLE_RASTERIZATION) + enable_rasteration=DEFAULT_ENABLE_RASTERIZATION, + ) block_M, block_N, block_K = DEFAULT_BLOCK_M, DEFAULT_BLOCK_N, DEFAULT_BLOCK_K print(f"Using default kernel with block size ({block_M}, {block_N}, {block_K})") - # Create block mask with desired sparsity mask_shape = (M // block_M, N // block_N, K // block_K) block_mask = torch.rand(mask_shape).cuda() > sparsity diff --git a/examples/cast/example_group_per_split_token_cast_to_fp8.py b/examples/cast/example_group_per_split_token_cast_to_fp8.py index 102ac20213b78ad011a358fc714f2ee9eb84a624..6bde50c512ad038424e99235edf7ca44abb2d853 100644 --- a/examples/cast/example_group_per_split_token_cast_to_fp8.py +++ b/examples/cast/example_group_per_split_token_cast_to_fp8.py @@ -5,8 +5,8 @@ from typing import Tuple from tilelang.utils.tensor import torch_assert_close # support bfloat16, float, float16 -dtype = "bfloat16" -accum_dtype = "float" +dtype = T.bfloat16 +accum_dtype = T.float32 @tilelang.jit(out_idx=[2, 3]) @@ -16,11 +16,13 @@ def group_per_split_token_cast_to_fp8(M, M_max, N, BG, blk_m): fp8_max = 448.0 @T.prim_func - def group_per_split_token_cast(X: T.Tensor((M, N), dtype), batch_sizes: T.Tensor( - (BG,), "int32"), X_fp8: T.Tensor((BG, M_max, N), "float8_e4m3"), X_amax: T.Tensor( - (BG, M_max, T.ceildiv(N, group_size)), accum_dtype)): - with T.Kernel( - T.ceildiv(M_max, blk_m), T.ceildiv(N, group_size), BG, threads=128) as (bx, by, bz): + def group_per_split_token_cast( + X: T.Tensor((M, N), dtype), + batch_sizes: T.Tensor((BG,), T.int32), + X_fp8: T.Tensor((BG, M_max, N), T.float8_e4m3fn), + X_amax: T.Tensor((BG, M_max, T.ceildiv(N, group_size)), 
accum_dtype), + ): + with T.Kernel(T.ceildiv(M_max, blk_m), T.ceildiv(N, group_size), BG, threads=128) as (bx, by, bz): row = bx row_g_id = by bg = bz @@ -28,39 +30,35 @@ def group_per_split_token_cast_to_fp8(M, M_max, N, BG, blk_m): y_amax_local = T.alloc_fragment((blk_m,), accum_dtype) y_s_local = T.alloc_fragment((blk_m,), accum_dtype) y_q_local = T.alloc_fragment((blk_m, group_size), accum_dtype) - y_q_local_fp8 = T.alloc_fragment((blk_m, group_size), "float8_e4m3") - row_offset = T.alloc_fragment((1,), "int32") + y_q_local_fp8 = T.alloc_fragment((blk_m, group_size), T.float8_e4m3fn) + row_offset = T.alloc_fragment((1,), T.int32) - T.annotate_layout({ - y_local: - T.Fragment( - y_local.shape, - forward_thread_fn=lambda i, j: (i // (blk_m // 4)) * 32 + j % 32), - }) + T.annotate_layout( + { + y_local: T.Fragment(y_local.shape, forward_thread_fn=lambda i, j: (i // (blk_m // 4)) * 32 + j % 32), + } + ) row_offset[0] = 0 for i in T.serial(bg): row_offset[0] += batch_sizes[i] T.copy( - X[row_offset[0] + row * blk_m:row_offset[0] + (row + 1) * blk_m, - row_g_id * group_size:(row_g_id + 1) * group_size], y_local) + X[row_offset[0] + row * blk_m : row_offset[0] + (row + 1) * blk_m, row_g_id * group_size : (row_g_id + 1) * group_size], + y_local, + ) T.reduce_absmax(y_local, y_amax_local, dim=1) for i in T.Parallel(blk_m): y_amax_local[i] = T.max(y_amax_local[i], 1e-4) - y_s_local[i] = T.if_then_else(row * blk_m + i < batch_sizes[bg], - y_amax_local[i] / fp8_max, 0) + y_s_local[i] = T.if_then_else(row * blk_m + i < batch_sizes[bg], y_amax_local[i] / fp8_max, 0) for i, j in T.Parallel(blk_m, group_size): y_q_local[i, j] = T.clamp(y_local[i, j] / y_s_local[i], fp8_min, fp8_max) T.copy(y_q_local, y_q_local_fp8) for i, j in T.Parallel(blk_m, group_size): - y_q_local_fp8[i, j] = T.if_then_else(row * blk_m + i < batch_sizes[bg], - y_q_local[i, j], 0) + y_q_local_fp8[i, j] = T.if_then_else(row * blk_m + i < batch_sizes[bg], y_q_local[i, j], 0) for i in T.Parallel(blk_m): X_amax[bg, row * blk_m + i, row_g_id] = y_s_local[i] - T.copy( - y_q_local_fp8, X_fp8[bg, row * blk_m:(row + 1) * blk_m, - row_g_id * group_size:(row_g_id + 1) * group_size]) + T.copy(y_q_local_fp8, X_fp8[bg, row * blk_m : (row + 1) * blk_m, row_g_id * group_size : (row_g_id + 1) * group_size]) return group_per_split_token_cast @@ -127,8 +125,7 @@ def get_col_major_tma_aligned_tensor(x: torch.Tensor) -> torch.Tensor: return x.squeeze(0) if remove_dim else x # Normal layout requires transposing - aligned_x = torch.transpose( - torch.empty((b, n, aligned_m), device=x.device, dtype=x.dtype), 1, 2) + aligned_x = torch.transpose(torch.empty((b, n, aligned_m), device=x.device, dtype=x.dtype), 1, 2) aligned_x[:, :m, :] = x aligned_x = aligned_x[:, :m, :] return aligned_x.squeeze(0) if remove_dim else aligned_x @@ -146,15 +143,17 @@ def ref_per_token_cast_to_fp8(x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tens x_fp8 = x_fp8.view(m, -1)[:, :n].contiguous() return x_fp8, (x_amax / 448.0).view(m, -1) -def ref_program(x: torch.Tensor, batch_sizes: torch.Tensor) -> \ - Tuple[torch.Tensor, torch.Tensor]: + +def ref_program(x: torch.Tensor, batch_sizes: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: # assert x.shape[0] == batch_sizes.sum() M_max = ceil_div(batch_sizes.max(), 128) * 128 split_x = torch.split(x, batch_sizes.tolist(), dim=0) padded_x = [torch.nn.functional.pad(t, (0, 0, 0, M_max - t.shape[0])) for t in split_x] num_groups, m, n = batch_sizes.shape[0], M_max, x.shape[1] - x_fp8 = (torch.empty((num_groups, m, n), device='cuda', 
dtype=torch.float8_e4m3fn), - torch.empty((num_groups, m, n // 128), device='cuda', dtype=torch.float)) + x_fp8 = ( + torch.empty((num_groups, m, n), device="cuda", dtype=torch.float8_e4m3fn), + torch.empty((num_groups, m, n // 128), device="cuda", dtype=torch.float), + ) for i in range(num_groups): x_fp8[0][i], x_fp8[1][i] = ref_per_token_cast_to_fp8(padded_x[i]) x_fp8 = (x_fp8[0], get_col_major_tma_aligned_tensor(x_fp8[1])) @@ -164,11 +163,11 @@ def ref_program(x: torch.Tensor, batch_sizes: torch.Tensor) -> \ def main(M=8192, N=8192, BG=2, blk_m=8, batch_sizes=None): if batch_sizes is None: batch_sizes = [2048, 6144] - if dtype == "float": + if dtype == T.float: x = torch.randn(M, N, device="cuda", dtype=torch.float32) - elif dtype == "float16": + elif dtype == T.float16: x = torch.randn(M, N, device="cuda", dtype=torch.float16) - elif dtype == "bfloat16": + elif dtype == T.bfloat16: x = torch.randn(M, N, device="cuda", dtype=torch.bfloat16) else: raise ValueError(f"Unsupported dtype: {dtype}") diff --git a/examples/cast/example_per_token_cast_to_fp8.py b/examples/cast/example_per_token_cast_to_fp8.py index 484a092f09b76a63890f032d36522205d21f6d0f..aa6d14884039d228e2d2082662e952571622db20 100644 --- a/examples/cast/example_per_token_cast_to_fp8.py +++ b/examples/cast/example_per_token_cast_to_fp8.py @@ -7,14 +7,15 @@ from tilelang.utils.tensor import torch_assert_close @tilelang.jit(out_idx=[1, 2]) def per_token_cast_to_fp8(M, N, blk_m): - dtype = "float" + dtype = T.float group_size = 128 fp8_min = -448.0 fp8_max = 448.0 @T.prim_func - def per_token_cast(X: T.Tensor((M, N), dtype), X_fp8: T.Tensor((M, N), "float8_e4m3"), - X_amax: T.Tensor((M, T.ceildiv(N, group_size)), dtype)): + def per_token_cast( + X: T.Tensor((M, N), dtype), X_fp8: T.Tensor((M, N), T.float8_e4m3fn), X_amax: T.Tensor((M, T.ceildiv(N, group_size)), dtype) + ): with T.Kernel(T.ceildiv(M, blk_m), T.ceildiv(N, group_size), threads=128) as (bx, by): row = bx row_g_id = by @@ -22,18 +23,15 @@ def per_token_cast_to_fp8(M, N, blk_m): y_amax_local = T.alloc_fragment((blk_m,), dtype) y_s_local = T.alloc_fragment((blk_m,), dtype) y_q_local = T.alloc_fragment((blk_m, group_size), dtype) - y_q_local_fp8 = T.alloc_fragment((blk_m, group_size), "float8_e4m3") - - T.annotate_layout({ - y_local: - T.Fragment( - y_local.shape, - forward_thread_fn=lambda i, j: (i // (blk_m // 4)) * 32 + j % 32), - }) - - T.copy( - X[row * blk_m:(row + 1) * blk_m, row_g_id * group_size:(row_g_id + 1) * group_size], - y_local) + y_q_local_fp8 = T.alloc_fragment((blk_m, group_size), T.float8_e4m3fn) + + T.annotate_layout( + { + y_local: T.Fragment(y_local.shape, forward_thread_fn=lambda i, j: (i // (blk_m // 4)) * 32 + j % 32), + } + ) + + T.copy(X[row * blk_m : (row + 1) * blk_m, row_g_id * group_size : (row_g_id + 1) * group_size], y_local) T.reduce_absmax(y_local, y_amax_local, dim=1) for i in T.Parallel(blk_m): y_amax_local[i] = T.max(y_amax_local[i], 1e-4) @@ -43,9 +41,7 @@ def per_token_cast_to_fp8(M, N, blk_m): T.copy(y_q_local, y_q_local_fp8) for i in T.Parallel(blk_m): X_amax[row * blk_m + i, row_g_id] = y_s_local[i] - T.copy( - y_q_local_fp8, X_fp8[row * blk_m:(row + 1) * blk_m, - row_g_id * group_size:(row_g_id + 1) * group_size]) + T.copy(y_q_local_fp8, X_fp8[row * blk_m : (row + 1) * blk_m, row_g_id * group_size : (row_g_id + 1) * group_size]) return per_token_cast @@ -105,8 +101,7 @@ def main(M=8192, N=8192, blk_m=8): from example_triton_cast_to_fp8 import per_token_group_quant_fp8 def run_triton(): - x_fp8_triton_, x_amax_triton_ = 
per_token_group_quant_fp8( - x, 128, 1e-4, dtype=torch.float8_e4m3fn, column_major_scales=False) + x_fp8_triton_, x_amax_triton_ = per_token_group_quant_fp8(x, 128, 1e-4, dtype=torch.float8_e4m3fn, column_major_scales=False) return x_fp8_triton_, x_amax_triton_ x_fp8_triton, x_amax_triton = run_triton() diff --git a/examples/cast/example_triton_cast_to_fp8.py b/examples/cast/example_triton_cast_to_fp8.py index cc56defe774b0d1467a39b0d199a2c015cfbf13b..1859433f10b6f6bd438846473b5661718c34fe4f 100644 --- a/examples/cast/example_triton_cast_to_fp8.py +++ b/examples/cast/example_triton_cast_to_fp8.py @@ -128,9 +128,7 @@ def per_token_group_quant_fp8( Tuple[torch.Tensor, torch.Tensor]: The quantized tensor and the scaling factor for quantization. """ - assert (x.shape[-1] % - group_size == 0), (f"the last dimension of `x` {x.shape[-1]} must be divisible " - f"by `group_size` {group_size}") + assert x.shape[-1] % group_size == 0, f"the last dimension of `x` {x.shape[-1]} must be divisible by `group_size` {group_size}" assert x.stride(-1) == 1, "`x` groups must be contiguous" finfo = torch.finfo(dtype) diff --git a/examples/cast/test_example_cast.py b/examples/cast/test_example_cast.py index 1ca000eb2cf9bfa916fb952dbfc499ddb413e4ed..e8b10a7979cf6506ec93c21bc8e9d3ddec2cc214 100644 --- a/examples/cast/test_example_cast.py +++ b/examples/cast/test_example_cast.py @@ -4,8 +4,7 @@ import example_per_token_cast_to_fp8 def test_example_group_per_split_token_cast_to_fp8(): - example_group_per_split_token_cast_to_fp8.main( - M=1024, N=1024, BG=2, blk_m=4, batch_sizes=[128, 896]) + example_group_per_split_token_cast_to_fp8.main(M=1024, N=1024, BG=2, blk_m=4, batch_sizes=[128, 896]) def test_example_per_token_cast_to_fp8(): diff --git a/examples/compile_flags/usecase.py b/examples/compile_flags/usecase.py deleted file mode 100644 index 8451b04fcf6ac3879ed1d98e1c6cf76143396db0..0000000000000000000000000000000000000000 --- a/examples/compile_flags/usecase.py +++ /dev/null @@ -1,56 +0,0 @@ -import tilelang -import tilelang.language as T - - -# @tilelang.jit(compile_flags=["-O3", "--use_fast_math", "--expt-relaxed-constexpr"]) -def matmul(M, N, K, block_M, block_N, block_K, dtype="float16", accum_dtype="float"): - - @T.prim_func - def main( - A: T.Tensor((M, K), dtype), - B: T.Tensor((K, N), dtype), - C: T.Tensor((M, N), dtype), - ): - # Initialize Kernel Context - with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=128) as (bx, by): - A_shared = T.alloc_shared((block_M, block_K), dtype) - B_shared = T.alloc_shared((block_K, block_N), dtype) - C_local = T.alloc_fragment((block_M, block_N), accum_dtype) - T.clear(C_local) - - for ko in T.Pipelined(T.ceildiv(K, block_K), num_stages=3): - T.copy(A[by * block_M, ko * block_K], A_shared) - T.copy(B[ko * block_K, bx * block_N], B_shared) - T.gemm(A_shared, B_shared, C_local) - T.copy(C_local, C[by * block_M, bx * block_N]) - - return main - - -M = 1024 -N = 1024 -K = 1024 -block_M = 128 -block_N = 128 -block_K = 32 - -func = matmul(M, N, K, block_M, block_N, block_K) - -jit_kernel = tilelang.compile( - func, out_idx=[2], target="cuda", compile_flags="-O3 --use_fast_math --expt-relaxed-constexpr") -# or jit_kernel = tilelang.compile(func, out_idx=[2], target="cuda", compile_flags=["-O3", "--use_fast_math", "--expt-relaxed-constexpr"]) -# or jit_kernel = tilelang.compile(func, out_idx=[2], target="cuda", compile_flags=["-O3 --use_fast_math --expt-relaxed-constexpr"]) - -import torch - -a = torch.randn(M, K, device="cuda", dtype=torch.float16) -b = 
torch.randn(K, N, device="cuda", dtype=torch.float16) - -c = jit_kernel(a, b) - -print(c) - -ref_c = a @ b - -torch.testing.assert_close(c, ref_c, rtol=1e-2, atol=1e-2) -print("Kernel output matches PyTorch reference.") diff --git a/examples/conftest.py b/examples/conftest.py index 9f49d40a9b50e14e41915811589d0011d3c2c910..4010e0d83ae84c641151d6dd56dbf40ee42e301f 100644 --- a/examples/conftest.py +++ b/examples/conftest.py @@ -33,12 +33,9 @@ def pytest_terminal_summary(terminalreporter, exitstatus, config): "warnings", "error", } - if (sum( - len(terminalreporter.stats.get(k, [])) - for k in known_types.difference({"skipped", "deselected"})) == 0): + if sum(len(terminalreporter.stats.get(k, [])) for k in known_types.difference({"skipped", "deselected"})) == 0: terminalreporter.write_sep( "!", - (f"Error: No tests were collected. " - f"{dict(sorted((k, len(v)) for k, v in terminalreporter.stats.items()))}"), + (f"Error: No tests were collected. {dict(sorted((k, len(v)) for k, v in terminalreporter.stats.items()))}"), ) pytest.exit("No tests were collected.", returncode=5) diff --git a/examples/convolution/example_convolution.py b/examples/convolution/example_convolution.py index b2696ba8f537d2ff6638be9fa247f56034435be0..ffd3972fb081ec57e6c569f1d42e28db0caa55be 100644 --- a/examples/convolution/example_convolution.py +++ b/examples/convolution/example_convolution.py @@ -14,7 +14,6 @@ def check_hopper(): def ref_program(stride, padding, dilation): - def main(A, B): A = A.permute(0, 3, 1, 2) # N, H, W, C -> N, C, H, W B = B.permute(3, 2, 0, 1) # H, W, C, F -> F, C, H, W @@ -26,38 +25,21 @@ def ref_program(stride, padding, dilation): @tilelang.jit(out_idx=[2]) -def convolution(N, - C, - H, - W, - F, - K, - S, - D, - P, - block_M, - block_N, - block_K, - num_stages, - threads, - dtype="float16", - accum_dtype="float"): +def convolution(N, C, H, W, F, K, S, D, P, block_M, block_N, block_K, num_stages, threads, dtype=T.float16, accum_dtype=T.float32): KH, KW = K, K OH = (H + 2 * P - D * (K - 1) - 1) // S + 1 OW = (W + 2 * P - D * (K - 1) - 1) // S + 1 - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 is_hopper = check_hopper() @T.prim_func def main( - data: T.Tensor((N, H, W, C), dtype), - kernel: T.Tensor((KH, KW, C, F), dtype), - out: T.Tensor((N, OH, OW, F), dtype), + data: T.Tensor((N, H, W, C), dtype), + kernel: T.Tensor((KH, KW, C, F), dtype), + out: T.Tensor((N, OH, OW, F), dtype), ): - with T.Kernel( - T.ceildiv(F, block_N), T.ceildiv(N * OH * OW, block_M), - threads=threads) as (bx, by): + with T.Kernel(T.ceildiv(F, block_N), T.ceildiv(N * OH * OW, block_M), threads=threads) as (bx, by): data_shared = T.alloc_shared((block_M, block_K), dtype) kernel_shared = T.alloc_shared((block_K, block_N), dtype) out_local = T.alloc_fragment((block_M, block_N), accum_dtype) @@ -66,11 +48,13 @@ def convolution(N, kernel_flat = T.Tensor((KH * KW * C, F), dtype, kernel.data) out_flat = T.Tensor((N * OH * OW, F), dtype, out.data) - T.annotate_layout({ - out_shared: tilelang.layout.make_swizzled_layout(out_shared), - data_shared: tilelang.layout.make_swizzled_layout(data_shared), - kernel_shared: tilelang.layout.make_swizzled_layout(kernel_shared), - }) + T.annotate_layout( + { + out_shared: tilelang.layout.make_swizzled_layout(out_shared), + data_shared: tilelang.layout.make_swizzled_layout(data_shared), + kernel_shared: tilelang.layout.make_swizzled_layout(kernel_shared), + } + ) T.clear(out_local) for k_iter in T.Pipelined(T.ceildiv(KH * KW * C, block_K), 
num_stages=num_stages): @@ -82,10 +66,8 @@ def convolution(N, m = by * block_M + i access_h = m % (OH * OW) // OW * S + k // (KW * C) * D - P access_w = m % OW * S + k // C % KW * D - P - in_bound = ((access_h >= 0) and (access_w >= 0) and (access_h < H) and - (access_w < W)) - data_shared[i, j] = T.if_then_else( - in_bound, data[m // (OH * OW), access_h, access_w, k % C], 0) + in_bound = (access_h >= 0) and (access_w >= 0) and (access_h < H) and (access_w < W) + data_shared[i, j] = T.if_then_else(in_bound, data[m // (OH * OW), access_h, access_w, k % C], 0) T.copy(kernel_flat[k_iter * block_K, bx * block_N], kernel_shared) T.gemm(data_shared, kernel_shared, out_local) @@ -97,15 +79,15 @@ def convolution(N, def main(argv=None): parser = argparse.ArgumentParser() - parser.add_argument('--n', type=int, default=128, help='n') - parser.add_argument('--c', type=int, default=128, help='c') - parser.add_argument('--h', type=int, default=64, help='h') - parser.add_argument('--w', type=int, default=64, help='w') - parser.add_argument('--f', type=int, default=128, help='f') - parser.add_argument('--k', type=int, default=3, help='k') - parser.add_argument('--s', type=int, default=1, help='s') - parser.add_argument('--d', type=int, default=1, help='d') - parser.add_argument('--p', type=int, default=1, help='p') + parser.add_argument("--n", type=int, default=128, help="n") + parser.add_argument("--c", type=int, default=128, help="c") + parser.add_argument("--h", type=int, default=64, help="h") + parser.add_argument("--w", type=int, default=64, help="w") + parser.add_argument("--f", type=int, default=128, help="f") + parser.add_argument("--k", type=int, default=3, help="k") + parser.add_argument("--s", type=int, default=1, help="s") + parser.add_argument("--d", type=int, default=1, help="d") + parser.add_argument("--p", type=int, default=1, help="p") args = parser.parse_args(argv) N, C, H, W, F, K, S, D, P = args.n, args.c, args.h, args.w, args.f, args.k, args.s, args.d, args.p diff --git a/examples/convolution/example_convolution_autotune.py b/examples/convolution/example_convolution_autotune.py index 393677489b7f0fb13ea3b1d3ae688043e5847a31..59588ac4fbd40db9e1c3d45ab0ff105af28ce004 100644 --- a/examples/convolution/example_convolution_autotune.py +++ b/examples/convolution/example_convolution_autotune.py @@ -14,7 +14,6 @@ def check_hopper(): def ref_program(stride, padding, dilation): - def main(A, B): A = A.permute(0, 3, 1, 2) # N, H, W, C -> N, C, H, W B = B.permute(3, 2, 0, 1) # H, W, C, F -> F, C, H, W @@ -40,7 +39,8 @@ def get_configs(): num_stages, thread_num, enable_rasterization, - )) + ) + ) configs = [ { @@ -50,7 +50,8 @@ def get_configs(): "num_stages": c[3], "thread_num": c[4], "enable_rasteration": c[5], # keep param name for backward-compat - } for c in _configs + } + for c in _configs ] return configs @@ -64,69 +65,32 @@ def get_heuristic_config() -> dict: sm_version = sm_major * 10 + sm_minor print(f"CUDA device capability: {sm_version}") if sm_version in {80}: - return { - "block_M": 128, - "block_N": 256, - "block_K": 32, - "num_stages": 2, - "thread_num": 128, - "enable_rasteration": True - } + return {"block_M": 128, "block_N": 256, "block_K": 32, "num_stages": 2, "thread_num": 128, "enable_rasteration": True} elif sm_version in {90}: - return { - "block_M": 128, - "block_N": 256, - "block_K": 64, - "num_stages": 3, - "thread_num": 256, - "enable_rasteration": True - } + return {"block_M": 128, "block_N": 256, "block_K": 64, "num_stages": 3, "thread_num": 256, "enable_rasteration": 
True} else: - return { - "block_M": 128, - "block_N": 256, - "block_K": 32, - "num_stages": 0, - "thread_num": 128, - "enable_rasteration": True - } + return {"block_M": 128, "block_N": 256, "block_K": 32, "num_stages": 0, "thread_num": 128, "enable_rasteration": True} @tilelang.autotune(configs=get_configs()) @tilelang.jit(out_idx=[2]) -def convolution(N, - C, - H, - W, - F, - K, - S, - D, - P, - block_M, - block_N, - block_K, - num_stages, - thread_num, - enable_rasteration, - dtype="float16", - accum_dtype="float"): +def convolution( + N, C, H, W, F, K, S, D, P, block_M, block_N, block_K, num_stages, thread_num, enable_rasteration, dtype=T.float16, accum_dtype=T.float32 +): KH, KW = K, K OH = (H + 2 * P - D * (K - 1) - 1) // S + 1 OW = (W + 2 * P - D * (K - 1) - 1) // S + 1 - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 is_hopper = check_hopper() @T.prim_func def main( - data: T.Tensor((N, H, W, C), dtype), - kernel: T.Tensor((KH, KW, C, F), dtype), - out: T.Tensor((N, OH, OW, F), dtype), + data: T.Tensor((N, H, W, C), dtype), + kernel: T.Tensor((KH, KW, C, F), dtype), + out: T.Tensor((N, OH, OW, F), dtype), ): - with T.Kernel( - T.ceildiv(F, block_N), T.ceildiv(N * OH * OW, block_M), - threads=thread_num) as (bx, by): + with T.Kernel(T.ceildiv(F, block_N), T.ceildiv(N * OH * OW, block_M), threads=thread_num) as (bx, by): data_shared = T.alloc_shared((block_M, block_K), dtype) kernel_shared = T.alloc_shared((block_K, block_N), dtype) out_local = T.alloc_fragment((block_M, block_N), accum_dtype) @@ -136,9 +100,11 @@ def convolution(N, out_flat = T.Tensor((N * OH * OW, F), dtype, out.data) if is_hopper: - T.annotate_layout({ - out_shared: tilelang.layout.make_swizzled_layout(out_shared), - }) + T.annotate_layout( + { + out_shared: tilelang.layout.make_swizzled_layout(out_shared), + } + ) T.clear(out_local) for k_iter in T.Pipelined(T.ceildiv(KH * KW * C, block_K), num_stages=num_stages): @@ -150,10 +116,8 @@ def convolution(N, m = by * block_M + i access_h = m % (OH * OW) // OW * S + k // (KW * C) * D - P access_w = m % OW * S + k // C % KW * D - P - in_bound = ((access_h >= 0) and (access_w >= 0) and (access_h < H) and - (access_w < W)) - data_shared[i, j] = T.if_then_else( - in_bound, data[m // (OH * OW), access_h, access_w, k % C], 0) + in_bound = (access_h >= 0) and (access_w >= 0) and (access_h < H) and (access_w < W) + data_shared[i, j] = T.if_then_else(in_bound, data[m // (OH * OW), access_h, access_w, k % C], 0) T.copy(kernel_flat[k_iter * block_K, bx * block_N], kernel_shared) T.gemm(data_shared, kernel_shared, out_local) @@ -166,17 +130,19 @@ def convolution(N, return main -def main(n: int = 128, - c: int = 128, - h: int = 64, - w: int = 64, - f: int = 128, - k: int = 3, - s: int = 1, - d: int = 1, - p: int = 1, - use_autotune: bool = False, - with_roller: bool = True): +def main( + n: int = 128, + c: int = 128, + h: int = 64, + w: int = 64, + f: int = 128, + k: int = 3, + s: int = 1, + d: int = 1, + p: int = 1, + use_autotune: bool = False, + with_roller: bool = True, +): N, C, H, W, F, K, S, D, P = n, c, h, w, f, k, s, d, p ref_prog = ref_program(S, P, D) @@ -196,25 +162,16 @@ def main(n: int = 128, if __name__ == "__main__": parser = argparse.ArgumentParser(description="Autotuned MatMul Benchmark") - parser.add_argument('--n', type=int, default=128, help='n') - parser.add_argument('--c', type=int, default=128, help='c') - parser.add_argument('--h', type=int, default=64, help='h') - parser.add_argument('--w', type=int, default=64, 
help='w') - parser.add_argument('--f', type=int, default=128, help='f') - parser.add_argument('--k', type=int, default=3, help='k') - parser.add_argument('--s', type=int, default=1, help='s') - parser.add_argument('--d', type=int, default=1, help='d') - parser.add_argument('--p', type=int, default=1, help='p') - parser.add_argument( - "--use_autotune", - action="store_true", - default=False, - help="Whether to use autotune for matmul configs") - parser.add_argument( - "--with_roller", - action="store_true", - default=True, - help="Whether to enable BitBLAS roller for search space") + parser.add_argument("--n", type=int, default=128, help="n") + parser.add_argument("--c", type=int, default=128, help="c") + parser.add_argument("--h", type=int, default=64, help="h") + parser.add_argument("--w", type=int, default=64, help="w") + parser.add_argument("--f", type=int, default=128, help="f") + parser.add_argument("--k", type=int, default=3, help="k") + parser.add_argument("--s", type=int, default=1, help="s") + parser.add_argument("--d", type=int, default=1, help="d") + parser.add_argument("--p", type=int, default=1, help="p") + parser.add_argument("--use_autotune", action="store_true", default=False, help="Whether to use autotune for matmul configs") + parser.add_argument("--with_roller", action="store_true", default=True, help="Whether to enable BitBLAS roller for search space") args = parser.parse_args() - main(args.n, args.c, args.h, args.w, args.f, args.k, args.s, args.d, args.p, args.use_autotune, - args.with_roller) + main(args.n, args.c, args.h, args.w, args.f, args.k, args.s, args.d, args.p, args.use_autotune, args.with_roller) diff --git a/examples/deepseek_deepgemm/example_deepgemm_fp8_2xAcc.py b/examples/deepseek_deepgemm/example_deepgemm_fp8_2xAcc.py index 715f09a9b143f55d793f2ccbf206db9e7d666d0e..18467a811898d20813b8ed1ac6b9838fd5efe59d 100644 --- a/examples/deepseek_deepgemm/example_deepgemm_fp8_2xAcc.py +++ b/examples/deepseek_deepgemm/example_deepgemm_fp8_2xAcc.py @@ -20,11 +20,11 @@ def tl_gemm( accum_dtype, ): assert in_dtype in [ - "float8_e4m3", + T.float8_e4m3fn, ], "Currently only float8_e4m3 is supported" assert out_dtype in [ - "bfloat16", - "float32", + T.bfloat16, + T.float32, ], "Currently only float16 and float32 are supported" group_size = 128 @@ -41,18 +41,17 @@ def tl_gemm( @T.prim_func def main( - A: T.Tensor(A_shape, in_dtype), - B: T.Tensor(B_shape, in_dtype), - C: T.Tensor((M, N), out_dtype), - scales_a: T.Tensor(Scales_A_shape, "float32"), - scales_b: T.Tensor(Scales_B_shape, "float32"), + A: T.Tensor(A_shape, in_dtype), + B: T.Tensor(B_shape, in_dtype), + C: T.Tensor((M, N), out_dtype), + scales_a: T.Tensor(Scales_A_shape, T.float32), + scales_b: T.Tensor(Scales_B_shape, T.float32), ): with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=128) as (bx, by): - A_shared = T.alloc_shared(A_shared_shape, in_dtype) B_shared = T.alloc_shared(B_shared_shape, in_dtype) C_shared = T.alloc_shared(C_shared_shape, out_dtype) - Scale_C_shared = T.alloc_shared((block_M), "float32") + Scale_C_shared = T.alloc_shared((block_M), T.float32) C_local = T.alloc_fragment(C_shared_shape, accum_dtype) C_local_accum = T.alloc_fragment(C_shared_shape, accum_dtype) @@ -93,21 +92,18 @@ def per_token_cast_to_fp8(x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: m, n = x.shape x_view = x.view(m, -1, 128) x_amax = x_view.abs().float().amax(dim=2).view(m, -1).clamp(1e-4) - return (x_view * (448.0 / x_amax.unsqueeze(2))).to(torch.float8_e4m3fn).view( - m, n), (x_amax / 
448.0).view(m, -1) + return (x_view * (448.0 / x_amax.unsqueeze(2))).to(torch.float8_e4m3fn).view(m, n), (x_amax / 448.0).view(m, -1) def per_block_cast_to_fp8(x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: assert x.dim() == 2 m, n = x.shape - x_padded = torch.zeros( - ceildiv(m, 128) * 128, ceildiv(n, 128) * 128, dtype=x.dtype, device=x.device) + x_padded = torch.zeros(ceildiv(m, 128) * 128, ceildiv(n, 128) * 128, dtype=x.dtype, device=x.device) x_padded[:m, :n] = x x_view = x_padded.view(-1, 128, x_padded.size(1) // 128, 128) x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4) x_scaled = (x_view * (448.0 / x_amax)).to(torch.float8_e4m3fn) - return x_scaled.view_as(x_padded)[:m, :n].contiguous(), (x_amax / 448.0).view( - x_view.size(0), x_view.size(2)) + return x_scaled.view_as(x_padded)[:m, :n].contiguous(), (x_amax / 448.0).view(x_view.size(0), x_view.size(2)) def ref_deepgemm_fp8(A_fp8, B_fp8, A_scale, B_scale, out_dtype): @@ -127,13 +123,14 @@ def ref_deepgemm_fp8(A_fp8, B_fp8, A_scale, B_scale, out_dtype): c_acc.zero_() for k in range(ceildiv(K, 128)): c = torch._scaled_mm( - A_fp8[i * 128:(i + 1) * 128, k * 128:(k + 1) * 128], - B_fp8[j * 128:(j + 1) * 128, k * 128:(k + 1) * 128].T, + A_fp8[i * 128 : (i + 1) * 128, k * 128 : (k + 1) * 128], + B_fp8[j * 128 : (j + 1) * 128, k * 128 : (k + 1) * 128].T, scale_a=A_scales[i, k].view(128, 1).contiguous(), scale_b=B_scales[j, k].view(1, 128).contiguous(), - out_dtype=torch.bfloat16) + out_dtype=torch.bfloat16, + ) c_acc += c.to(torch.float32) - C[i * 128:(i + 1) * 128, j * 128:(j + 1) * 128] = c_acc.to(out_dtype) + C[i * 128 : (i + 1) * 128, j * 128 : (j + 1) * 128] = c_acc.to(out_dtype) return C @@ -179,11 +176,11 @@ def assert_tl_gemm_correctness(M, N, K, block_N, in_dtype, out_dtype, accum_dtyp def main(): - assert_tl_gemm_correctness(1024, 1024, 8192, 128, "float8_e4m3", "bfloat16", "float32") + assert_tl_gemm_correctness(1024, 1024, 8192, 128, T.float8_e4m3fn, T.bfloat16, T.float32) if __name__ == "__main__": - for dtype in ["float8_e4m3"]: - for out_dtype in ["bfloat16", "float32"]: + for dtype in [T.float8_e4m3fn]: + for out_dtype in [T.bfloat16, T.float32]: for block_N in [16, 32, 64, 128]: - assert_tl_gemm_correctness(1024, 1024, 8192, block_N, dtype, out_dtype, "float32") + assert_tl_gemm_correctness(1024, 1024, 8192, block_N, dtype, out_dtype, T.float32) diff --git a/examples/deepseek_mla/amd/benchmark_mla_decode_amd_tilelang.py b/examples/deepseek_mla/amd/benchmark_mla_decode_amd_tilelang.py index db460437fde6d6cfc20f3618e9a4252e8a016003..a9035793b9f305ee330185db329e615710488635 100644 --- a/examples/deepseek_mla/amd/benchmark_mla_decode_amd_tilelang.py +++ b/examples/deepseek_mla/amd/benchmark_mla_decode_amd_tilelang.py @@ -8,6 +8,7 @@ import argparse def get_configs(): import itertools + BLOCK_N = [16, 32, 64, 128] BLOCK_H = [16, 32, 64, 128] num_split = [1, 2, 4, 8, 16, 32] @@ -15,43 +16,39 @@ def get_configs(): _configs = list(itertools.product(BLOCK_N, BLOCK_H, num_split, threads)) - return [{ - "block_N": c[0], - "block_H": c[1], - "num_split": c[2], - "threads": c[3], - } for c in _configs] + return [ + { + "block_N": c[0], + "block_H": c[1], + "num_split": c[2], + "threads": c[3], + } + for c in _configs + ] @tilelang.autotune(configs=get_configs()) @tilelang.jit( - out_idx=[6], pass_configs={ + out_idx=[6], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) -def flashmla_decode(batch, - heads, - kv_head_num, - seqlen_kv, - dim, - pe_dim, - block_N, - block_H, - 
num_split, - threads=128): - scale = (1.0 / (dim + pe_dim))**0.5 * 1.44269504 # log2(e) - dtype = "float16" - accum_dtype = "float" + }, +) +def flashmla_decode(batch, heads, kv_head_num, seqlen_kv, dim, pe_dim, block_N, block_H, num_split, threads=128): + scale = (1.0 / (dim + pe_dim)) ** 0.5 * 1.44269504 # log2(e) + dtype = T.float16 + accum_dtype = T.float32 kv_group_num = heads // kv_head_num VALID_BLOCK_H = min(block_H, kv_group_num) assert kv_head_num == 1, "kv_head_num must be 1" @T.macro def flash_attn( - Q: T.Tensor([batch, heads, dim], dtype), - Q_pe: T.Tensor([batch, heads, pe_dim], dtype), - KV: T.Tensor([batch, seqlen_kv, kv_head_num, dim], dtype), - K_pe: T.Tensor([batch, seqlen_kv, kv_head_num, pe_dim], dtype), - Output: T.Tensor([batch, heads, dim], dtype), + Q: T.Tensor([batch, heads, dim], dtype), + Q_pe: T.Tensor([batch, heads, pe_dim], dtype), + KV: T.Tensor([batch, seqlen_kv, kv_head_num, dim], dtype), + K_pe: T.Tensor([batch, seqlen_kv, kv_head_num, pe_dim], dtype), + Output: T.Tensor([batch, heads, dim], dtype), ): with T.Kernel(batch, heads // min(block_H, kv_group_num), threads=threads) as (bx, by): Q_local = T.alloc_fragment([block_H, dim], dtype) @@ -70,27 +67,24 @@ def flashmla_decode(batch, cur_kv_head = by // (kv_group_num // block_H) T.use_swizzle(10) - T.copy(Q[bx, by * VALID_BLOCK_H:(by + 1) * VALID_BLOCK_H, :], Q_local) - T.copy(Q_pe[bx, by * VALID_BLOCK_H:(by + 1) * VALID_BLOCK_H, :], Q_pe_local) + T.copy(Q[bx, by * VALID_BLOCK_H : (by + 1) * VALID_BLOCK_H, :], Q_local) + T.copy(Q_pe[bx, by * VALID_BLOCK_H : (by + 1) * VALID_BLOCK_H, :], Q_pe_local) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) loop_range = T.ceildiv(seqlen_kv, block_N) for k in T.Pipelined(loop_range, num_stages=0): - T.copy(KV[bx, k * block_N:(k + 1) * block_N, cur_kv_head, :], KV_shared) - T.copy(K_pe[bx, k * block_N:(k + 1) * block_N, cur_kv_head, :], K_pe_shared) + T.copy(KV[bx, k * block_N : (k + 1) * block_N, cur_kv_head, :], KV_shared) + T.copy(K_pe[bx, k * block_N : (k + 1) * block_N, cur_kv_head, :], K_pe_shared) T.clear(acc_s) T.gemm(Q_local, KV_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - T.gemm( - Q_pe_local, - K_pe_shared, - acc_s, - transpose_B=True, - policy=T.GemmWarpPolicy.FullRow) + T.gemm(Q_pe_local, K_pe_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_H): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) for i in T.Parallel(block_H): scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_H, block_N): @@ -105,20 +99,18 @@ def flashmla_decode(batch, T.gemm(acc_s_cast, KV_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) for i, j in T.Parallel(block_H, dim): acc_o[i, j] /= logsum[i] - T.copy(acc_o, Output[bx, by * VALID_BLOCK_H:(by + 1) * VALID_BLOCK_H, :]) + T.copy(acc_o, Output[bx, by * VALID_BLOCK_H : (by + 1) * VALID_BLOCK_H, :]) @T.macro def flash_attn_split( - Q: T.Tensor([batch, heads, dim], dtype), - Q_pe: T.Tensor([batch, heads, pe_dim], dtype), - KV: T.Tensor([batch, seqlen_kv, kv_head_num, dim], dtype), - K_pe: T.Tensor([batch, seqlen_kv, kv_head_num, pe_dim], dtype), - glse: T.Tensor([batch, heads, num_split], dtype), - Output_partial: T.Tensor([batch, heads, num_split, dim], dtype), + Q: T.Tensor([batch, heads, dim], dtype), + Q_pe: T.Tensor([batch, heads, pe_dim], 
dtype), + KV: T.Tensor([batch, seqlen_kv, kv_head_num, dim], dtype), + K_pe: T.Tensor([batch, seqlen_kv, kv_head_num, pe_dim], dtype), + glse: T.Tensor([batch, heads, num_split], dtype), + Output_partial: T.Tensor([batch, heads, num_split, dim], dtype), ): - with T.Kernel( - batch, heads // min(block_H, kv_group_num), num_split, - threads=threads) as (bx, by, bz): + with T.Kernel(batch, heads // min(block_H, kv_group_num), num_split, threads=threads) as (bx, by, bz): Q_local = T.alloc_fragment([block_H, dim], dtype) Q_pe_local = T.alloc_fragment([block_H, pe_dim], dtype) KV_shared = T.alloc_shared([block_N, dim], dtype) @@ -134,8 +126,8 @@ def flashmla_decode(batch, cur_kv_head = by // (kv_group_num // block_H) T.use_swizzle(10) - T.copy(Q[bx, by * VALID_BLOCK_H:(by + 1) * VALID_BLOCK_H, :], Q_local) - T.copy(Q_pe[bx, by * VALID_BLOCK_H:(by + 1) * VALID_BLOCK_H, :], Q_pe_local) + T.copy(Q[bx, by * VALID_BLOCK_H : (by + 1) * VALID_BLOCK_H, :], Q_local) + T.copy(Q_pe[bx, by * VALID_BLOCK_H : (by + 1) * VALID_BLOCK_H, :], Q_pe_local) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) @@ -148,15 +140,12 @@ def flashmla_decode(batch, T.copy(K_pe[bx, kv_start:kv_end, cur_kv_head, :], K_pe_shared) T.clear(acc_s) T.gemm(Q_local, KV_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - T.gemm( - Q_pe_local, - K_pe_shared, - acc_s, - transpose_B=True, - policy=T.GemmWarpPolicy.FullRow) + T.gemm(Q_pe_local, K_pe_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_H): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) for i in T.Parallel(block_H): scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_H, block_N): @@ -172,14 +161,14 @@ def flashmla_decode(batch, acc_o[i, j] /= logsum[i] for i in T.Parallel(block_H): logsum[i] = T.log2(logsum[i]) + scores_max[i] * scale - T.copy(logsum, glse[bx, by * VALID_BLOCK_H:(by + 1) * VALID_BLOCK_H, bz]) - T.copy(acc_o, Output_partial[bx, by * VALID_BLOCK_H:(by + 1) * VALID_BLOCK_H, bz, :]) + T.copy(logsum, glse[bx, by * VALID_BLOCK_H : (by + 1) * VALID_BLOCK_H, bz]) + T.copy(acc_o, Output_partial[bx, by * VALID_BLOCK_H : (by + 1) * VALID_BLOCK_H, bz, :]) @T.macro def combine( - glse: T.Tensor([batch, heads, num_split], dtype), - Output_partial: T.Tensor([batch, heads, num_split, dim], dtype), - Output: T.Tensor([batch, heads, dim], dtype), + glse: T.Tensor([batch, heads, num_split], dtype), + Output_partial: T.Tensor([batch, heads, num_split, dim], dtype), + Output: T.Tensor([batch, heads, dim], dtype), ): with T.Kernel(heads, batch, threads=128) as (by, bz): po_local = T.alloc_fragment([dim], dtype) @@ -189,9 +178,11 @@ def flashmla_decode(batch, lse_max_local = T.alloc_local([1], accum_dtype) scale_local = T.alloc_local([1], accum_dtype) - T.annotate_layout({ - lse_logsum_local: T.Fragment(lse_logsum_local.shape, forward_thread_fn=lambda i: i), - }) + T.annotate_layout( + { + lse_logsum_local: T.Fragment(lse_logsum_local.shape, forward_thread_fn=lambda i: i), + } + ) T.clear(lse_logsum_local) T.clear(o_accum_local) @@ -214,26 +205,26 @@ def flashmla_decode(batch, @T.prim_func def main_split( - Q: T.Tensor([batch, heads, dim], dtype), - Q_pe: T.Tensor([batch, heads, pe_dim], dtype), - KV: T.Tensor([batch, seqlen_kv, kv_head_num, dim], dtype), - K_pe: T.Tensor([batch, seqlen_kv, 
kv_head_num, pe_dim], dtype), - glse: T.Tensor([batch, heads, num_split], dtype), - Output_partial: T.Tensor([batch, heads, num_split, dim], dtype), - Output: T.Tensor([batch, heads, dim], dtype), + Q: T.Tensor([batch, heads, dim], dtype), + Q_pe: T.Tensor([batch, heads, pe_dim], dtype), + KV: T.Tensor([batch, seqlen_kv, kv_head_num, dim], dtype), + K_pe: T.Tensor([batch, seqlen_kv, kv_head_num, pe_dim], dtype), + glse: T.Tensor([batch, heads, num_split], dtype), + Output_partial: T.Tensor([batch, heads, num_split, dim], dtype), + Output: T.Tensor([batch, heads, dim], dtype), ): flash_attn_split(Q, Q_pe, KV, K_pe, glse, Output_partial) combine(glse, Output_partial, Output) @T.prim_func def main_no_split( - Q: T.Tensor([batch, heads, dim], dtype), - Q_pe: T.Tensor([batch, heads, pe_dim], dtype), - KV: T.Tensor([batch, seqlen_kv, kv_head_num, dim], dtype), - K_pe: T.Tensor([batch, seqlen_kv, kv_head_num, pe_dim], dtype), - glse: T.Tensor([batch, heads, num_split], dtype), - Output_partial: T.Tensor([batch, heads, num_split, dim], dtype), - Output: T.Tensor([batch, heads, dim], dtype), + Q: T.Tensor([batch, heads, dim], dtype), + Q_pe: T.Tensor([batch, heads, pe_dim], dtype), + KV: T.Tensor([batch, seqlen_kv, kv_head_num, dim], dtype), + K_pe: T.Tensor([batch, seqlen_kv, kv_head_num, pe_dim], dtype), + glse: T.Tensor([batch, heads, num_split], dtype), + Output_partial: T.Tensor([batch, heads, num_split, dim], dtype), + Output: T.Tensor([batch, heads, dim], dtype), ): flash_attn(Q, Q_pe, KV, K_pe, Output) @@ -258,43 +249,36 @@ def ref_program(q, q_pe, kv, k_pe, glse, Output_partial): dim = q.shape[-1] pe_dim = q_pe.shape[-1] num_head_groups = q.shape[1] // kv.shape[2] - scale = (dim + pe_dim)**0.5 - q = rearrange( - q, 'b (h g) d -> b g h d', g=num_head_groups) # [batch_size, num_head_groups, groups, dim] + scale = (dim + pe_dim) ** 0.5 + q = rearrange(q, "b (h g) d -> b g h d", g=num_head_groups) # [batch_size, num_head_groups, groups, dim] - q_pe = rearrange( - q_pe, 'b (h g) d -> b g h d', - g=num_head_groups) # [batch_size, num_head_groups, groups, pe_dim] + q_pe = rearrange(q_pe, "b (h g) d -> b g h d", g=num_head_groups) # [batch_size, num_head_groups, groups, pe_dim] - kv = rearrange(kv, 'b n h d -> b h n d') # [batch_size, groups, seqlen_kv, dim] + kv = rearrange(kv, "b n h d -> b h n d") # [batch_size, groups, seqlen_kv, dim] - k_pe = rearrange(k_pe, 'b n h d -> b h n d') # [batch_size, num_head_groups, groups, pe_dim] + k_pe = rearrange(k_pe, "b n h d -> b h n d") # [batch_size, num_head_groups, groups, pe_dim] query = torch.concat([q, q_pe], dim=-1) key = torch.concat([kv, k_pe], dim=-1) - scores = einsum( - query, key, - 'b g h d, b h s d -> b g h s') # [batch_size, num_head_groups, groups, seqlen_kv] + scores = einsum(query, key, "b g h d, b h s d -> b g h s") # [batch_size, num_head_groups, groups, seqlen_kv] - attention = F.softmax( - scores / scale, dim=-1) # [batch_size, num_head_groups, groups, seqlen_kv] + attention = F.softmax(scores / scale, dim=-1) # [batch_size, num_head_groups, groups, seqlen_kv] - out = einsum(attention, kv, - 'b g h s, b h s d -> b g h d') # [batch_size, num_head_groups, groups, dim] - out = rearrange(out, 'b g h d -> b (h g) d') # [batch_size, heads, dim] + out = einsum(attention, kv, "b g h s, b h s d -> b g h d") # [batch_size, num_head_groups, groups, dim] + out = rearrange(out, "b g h d -> b (h g) d") # [batch_size, heads, dim] return out if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, 
default=128, help='batch size') - parser.add_argument('--heads', type=int, default=128, help='q heads number') - parser.add_argument('--kv_heads', type=int, default=1, help='kv heads number') - parser.add_argument('--kv_ctx', type=int, default=8192, help='kv context length') - parser.add_argument('--dim', type=int, default=512, help='head dim') - parser.add_argument('--pe_dim', type=int, default=64, help='pe head dim') - parser.add_argument('--autotune', action='store_true', help='auto tune') + parser.add_argument("--batch", type=int, default=128, help="batch size") + parser.add_argument("--heads", type=int, default=128, help="q heads number") + parser.add_argument("--kv_heads", type=int, default=1, help="kv heads number") + parser.add_argument("--kv_ctx", type=int, default=8192, help="kv context length") + parser.add_argument("--dim", type=int, default=512, help="head dim") + parser.add_argument("--pe_dim", type=int, default=64, help="pe head dim") + parser.add_argument("--autotune", action="store_true", help="auto tune") args = parser.parse_args() batch, heads, kv_heads, kv_ctx, dim, pe_dim = args.batch, args.heads, args.kv_heads, args.kv_ctx, args.dim, args.pe_dim enable_autotune = args.autotune @@ -310,17 +294,7 @@ if __name__ == "__main__": if enable_autotune: kernel = flashmla_decode(batch, heads, kv_heads, kv_ctx, dim, pe_dim) else: - kernel = flashmla_decode( - batch, - heads, - kv_heads, - kv_ctx, - dim, - pe_dim, - BLOCK_N, - BLOCK_H, - num_split, - threads=threads) + kernel = flashmla_decode(batch, heads, kv_heads, kv_ctx, dim, pe_dim, BLOCK_N, BLOCK_H, num_split, threads=threads) profiler = kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Randn) input_tensors = profiler._get_inputs() tilelang_output = kernel(*input_tensors) diff --git a/examples/deepseek_mla/amd/benchmark_mla_decode_amd_torch.py b/examples/deepseek_mla/amd/benchmark_mla_decode_amd_torch.py index 0006d946875a890782e50d2bee558a0842dbfaf9..18c0a5f86d7625af022832d36f58b123c0feb0f8 100644 --- a/examples/deepseek_mla/amd/benchmark_mla_decode_amd_torch.py +++ b/examples/deepseek_mla/amd/benchmark_mla_decode_amd_torch.py @@ -32,8 +32,7 @@ def scaled_dot_product_attention(query, key, value, h_q, h_kv, is_causal=False): @torch.inference_mode() -def run_torch_mla(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, - h_kv, d, dv, causal, dtype): +def run_torch_mla(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype): blocked_v = blocked_k[..., :dv] def ref_mla(): @@ -94,8 +93,7 @@ def _mla_attn_kernel( offs_d_ckv = tl.arange(0, HEAD_DIM_CKV) cur_head = cur_head_id * BLOCK_H + tl.arange(0, BLOCK_H) - offs_q_nope = cur_batch * stride_q_nope_bs + cur_head[:, None] * stride_q_nope_h + offs_d_ckv[ - None, :] + offs_q_nope = cur_batch * stride_q_nope_bs + cur_head[:, None] * stride_q_nope_h + offs_d_ckv[None, :] q_nope = tl.load(Q_nope + offs_q_nope) offs_d_kpe = tl.arange(0, HEAD_DIM_KPE) @@ -141,9 +139,7 @@ def _mla_attn_kernel( e_sum = e_sum * re_scale + tl.sum(p, 1) e_max = n_e_max - offs_o = cur_batch * stride_o_b + cur_head[:, - None] * stride_o_h + split_kv_id * stride_o_s + offs_d_ckv[ - None, :] + offs_o = cur_batch * stride_o_b + cur_head[:, None] * stride_o_h + split_kv_id * stride_o_s + offs_d_ckv[None, :] tl.store(O + offs_o, acc / e_sum[:, None]) offs_o_1 = cur_batch * stride_o_b + cur_head * stride_o_h + split_kv_id * stride_o_s + HEAD_DIM_CKV tl.store(O + offs_o_1, e_max + tl.log(e_sum)) @@ -309,24 +305,30 @@ 
def mla_decode_triton( @torch.inference_mode() -def run_flash_mla_triton(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, - cache_seqlens, h_q, h_kv, d, dv, causal, dtype): - +def run_flash_mla_triton(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype): blocked_v = blocked_k[..., :dv] assert d > dv, "mla with rope dim should be larger than no rope dim" q_nope, q_pe = q[..., :dv].contiguous(), q[..., dv:].contiguous() - blocked_k_nope, blocked_k_pe = blocked_k[..., :dv].contiguous(), blocked_k[..., - dv:].contiguous() + blocked_k_nope, blocked_k_pe = blocked_k[..., :dv].contiguous(), blocked_k[..., dv:].contiguous() def flash_mla_triton(): num_kv_splits = 32 o = torch.empty([b * s_q, h_q, dv]) attn_logits = torch.empty([b * s_q, h_q, num_kv_splits, dv + 1]) mla_decode_triton( - q_nope.view(-1, h_q, dv), q_pe.view(-1, h_q, d - dv), blocked_k_nope.view(-1, dv), - blocked_k_pe.view(-1, d - dv), o, block_table, cache_seqlens, attn_logits, - num_kv_splits, 1 / math.sqrt(d), block_size) + q_nope.view(-1, h_q, dv), + q_pe.view(-1, h_q, d - dv), + blocked_k_nope.view(-1, dv), + blocked_k_pe.view(-1, d - dv), + o, + block_table, + cache_seqlens, + attn_logits, + num_kv_splits, + 1 / math.sqrt(d), + block_size, + ) return o.view([b, s_q, h_q, dv]) out_flash = flash_mla_triton() @@ -362,14 +364,15 @@ def compare_ab(baseline, target, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal q = torch.randn(b, s_q, h_q, d) block_size = 64 - block_table = torch.arange( - b * max_seqlen_pad // block_size, dtype=torch.int32).view(b, max_seqlen_pad // block_size) + block_table = torch.arange(b * max_seqlen_pad // block_size, dtype=torch.int32).view(b, max_seqlen_pad // block_size) blocked_k = torch.randn(block_table.numel(), block_size, h_kv, d) - out_a, lse_a, perf_a = baseline_func(q, block_table, blocked_k, max_seqlen_pad, block_size, b, - s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype) - out_b, lse_b, perf_b = target_func(q, block_table, blocked_k, max_seqlen_pad, block_size, b, - s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype) + out_a, lse_a, perf_a = baseline_func( + q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype + ) + out_b, lse_b, perf_b = target_func( + q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype + ) torch.testing.assert_close(out_b.float(), out_a.float(), atol=1e-2, rtol=1e-2), "out" if target not in ["flash_mla_triton"]: @@ -377,21 +380,14 @@ def compare_ab(baseline, target, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal torch.testing.assert_close(lse_b.float(), lse_a.float(), atol=1e-2, rtol=1e-2), "lse" FLOPS = s_q * total_seqlens * h_q * (d + dv) * 2 - bytes = (total_seqlens * h_kv * d + b * s_q * h_q * d + b * s_q * h_q * dv) * ( - torch.finfo(dtype).bits // 8) - print( - f"perf {baseline}: {perf_a:.3f} ms, {FLOPS / 10 ** 9 / perf_a:.0f} TFLOPS, {bytes / 10 ** 6 / perf_a:.0f} GB/s" - ) - print( - f"perf {target}: {perf_b:.3f} ms, {FLOPS / 10 ** 9 / perf_b:.0f} TFLOPS, {bytes / 10 ** 6 / perf_b:.0f} GB/s" - ) + bytes = (total_seqlens * h_kv * d + b * s_q * h_q * d + b * s_q * h_q * dv) * (torch.finfo(dtype).bits // 8) + print(f"perf {baseline}: {perf_a:.3f} ms, {FLOPS / 10**9 / perf_a:.0f} TFLOPS, {bytes / 10**6 / perf_a:.0f} GB/s") + print(f"perf {target}: {perf_b:.3f} ms, {FLOPS / 10**9 / perf_b:.0f} TFLOPS, {bytes / 10**6 / perf_b:.0f} GB/s") return bytes / 10**6 / perf_a, bytes / 10**6 / 
perf_b def compare_a(target, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype): - print( - f"{target}: {b=}, {s_q=}, mean_seqlens={cache_seqlens.float().mean()}, {h_q=}, {h_kv=}, {d=}, {dv=}, {causal=}, {dtype=}" - ) + print(f"{target}: {b=}, {s_q=}, mean_seqlens={cache_seqlens.float().mean()}, {h_q=}, {h_kv=}, {d=}, {dv=}, {causal=}, {dtype=}") torch.set_default_dtype(dtype) device = torch.device("cuda:0") torch.set_default_device(device) @@ -408,19 +404,16 @@ def compare_a(target, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype): q = torch.randn(b, s_q, h_q, d) block_size = 64 - block_table = torch.arange( - b * max_seqlen_pad // block_size, dtype=torch.int32).view(b, max_seqlen_pad // block_size) + block_table = torch.arange(b * max_seqlen_pad // block_size, dtype=torch.int32).view(b, max_seqlen_pad // block_size) blocked_k = torch.randn(block_table.numel(), block_size, h_kv, d) - out_b, lse_b, perf_b = target_func(q, block_table, blocked_k, max_seqlen_pad, block_size, b, - s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype) + out_b, lse_b, perf_b = target_func( + q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype + ) FLOPS = s_q * total_seqlens * h_q * (d + dv) * 2 - bytes = (total_seqlens * h_kv * d + b * s_q * h_q * d + b * s_q * h_q * dv) * ( - torch.finfo(dtype).bits // 8) - print( - f"perf {target}: {perf_b:.3f} ms, {FLOPS / 10 ** 9 / perf_b:.0f} TFLOPS, {bytes / 10 ** 6 / perf_b:.0f} GB/s" - ) + bytes = (total_seqlens * h_kv * d + b * s_q * h_q * d + b * s_q * h_q * dv) * (torch.finfo(dtype).bits // 8) + print(f"perf {target}: {perf_b:.3f} ms, {FLOPS / 10**9 / perf_b:.0f} TFLOPS, {bytes / 10**6 / perf_b:.0f} GB/s") return bytes / 10**6 / perf_b @@ -429,26 +422,22 @@ available_targets = [ "flash_mla_triton", ] -shape_configs = [{ - "b": - batch, - "s_q": - 1, - "cache_seqlens": - torch.tensor([seqlen + 2 * i for i in range(batch)], dtype=torch.int32, device="cuda"), - "h_q": - head, - "h_kv": - 1, - "d": - 512 + 64, - "dv": - 512, - "causal": - True, - "dtype": - torch.float16 -} for batch in [128] for seqlen in [1024, 2048, 4096, 8192, 16384] for head in [128]] +shape_configs = [ + { + "b": batch, + "s_q": 1, + "cache_seqlens": torch.tensor([seqlen + 2 * i for i in range(batch)], dtype=torch.int32, device="cuda"), + "h_q": head, + "h_kv": 1, + "d": 512 + 64, + "dv": 512, + "causal": True, + "dtype": torch.float16, + } + for batch in [128] + for seqlen in [1024, 2048, 4096, 8192, 16384] + for head in [128] +] def get_args(): @@ -470,26 +459,54 @@ if __name__ == "__main__": for shape in shape_configs: if args.all: for target in available_targets: - perf = compare_a(target, shape["b"], shape["s_q"], shape["cache_seqlens"], - shape["h_q"], shape["h_kv"], shape["d"], shape["dv"], - shape["causal"], shape["dtype"]) + perf = compare_a( + target, + shape["b"], + shape["s_q"], + shape["cache_seqlens"], + shape["h_q"], + shape["h_kv"], + shape["d"], + shape["dv"], + shape["causal"], + shape["dtype"], + ) fout.write( - f'{target},{shape["b"]},{shape["cache_seqlens"].float().mean().cpu().item():.0f},{shape["h_q"]},{perf:.0f}\n' + f"{target},{shape['b']},{shape['cache_seqlens'].float().mean().cpu().item():.0f},{shape['h_q']},{perf:.0f}\n" ) elif args.compare: - perfa, prefb = compare_ab(args.baseline, args.target, shape["b"], shape["s_q"], - shape["cache_seqlens"], shape["h_q"], shape["h_kv"], - shape["d"], shape["dv"], shape["causal"], shape["dtype"]) + perfa, prefb = compare_ab( + args.baseline, + args.target, + 
shape["b"], + shape["s_q"], + shape["cache_seqlens"], + shape["h_q"], + shape["h_kv"], + shape["d"], + shape["dv"], + shape["causal"], + shape["dtype"], + ) fout.write( - f'{args.baseline},{shape["b"]},{shape["cache_seqlens"].float().mean().cpu().item():.0f},{shape["h_q"]},{perfa:.0f}\n' + f"{args.baseline},{shape['b']},{shape['cache_seqlens'].float().mean().cpu().item():.0f},{shape['h_q']},{perfa:.0f}\n" ) fout.write( - f'{args.target},{shape["b"]},{shape["cache_seqlens"].float().mean().cpu().item():.0f},{shape["h_q"]},{prefb:.0f}\n' + f"{args.target},{shape['b']},{shape['cache_seqlens'].float().mean().cpu().item():.0f},{shape['h_q']},{prefb:.0f}\n" ) elif args.one: - perf = compare_a(args.target, shape["b"], shape["s_q"], shape["cache_seqlens"], - shape["h_q"], shape["h_kv"], shape["d"], shape["dv"], - shape["causal"], shape["dtype"]) + perf = compare_a( + args.target, + shape["b"], + shape["s_q"], + shape["cache_seqlens"], + shape["h_q"], + shape["h_kv"], + shape["d"], + shape["dv"], + shape["causal"], + shape["dtype"], + ) fout.write( - f'{args.target},{shape["b"]},{shape["cache_seqlens"].float().mean().cpu().item():.0f},{shape["h_q"]},{perf:.0f}\n' + f"{args.target},{shape['b']},{shape['cache_seqlens'].float().mean().cpu().item():.0f},{shape['h_q']},{perf:.0f}\n" ) diff --git a/examples/deepseek_mla/amd/benchmark_mla_decode_amd_triton.py b/examples/deepseek_mla/amd/benchmark_mla_decode_amd_triton.py index 644f97da15c12ddc1ccb1fd17b3be8b41a80b106..861e841c4ec8b68851cd4bfdbfdce0fede87960f 100644 --- a/examples/deepseek_mla/amd/benchmark_mla_decode_amd_triton.py +++ b/examples/deepseek_mla/amd/benchmark_mla_decode_amd_triton.py @@ -29,8 +29,7 @@ def scaled_dot_product_attention(query, key, value, h_q, h_kv, is_causal=False): @torch.inference_mode() -def run_torch_mla(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, - h_kv, d, dv, causal, dtype): +def run_torch_mla(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype): blocked_v = blocked_k[..., :dv] def ref_mla(): @@ -91,8 +90,7 @@ def _mla_attn_kernel( offs_d_ckv = tl.arange(0, HEAD_DIM_CKV) cur_head = cur_head_id * BLOCK_H + tl.arange(0, BLOCK_H) - offs_q_nope = cur_batch * stride_q_nope_bs + cur_head[:, None] * stride_q_nope_h + offs_d_ckv[ - None, :] + offs_q_nope = cur_batch * stride_q_nope_bs + cur_head[:, None] * stride_q_nope_h + offs_d_ckv[None, :] q_nope = tl.load(Q_nope + offs_q_nope) offs_d_kpe = tl.arange(0, HEAD_DIM_KPE) @@ -138,9 +136,7 @@ def _mla_attn_kernel( e_sum = e_sum * re_scale + tl.sum(p, 1) e_max = n_e_max - offs_o = cur_batch * stride_o_b + cur_head[:, - None] * stride_o_h + split_kv_id * stride_o_s + offs_d_ckv[ - None, :] + offs_o = cur_batch * stride_o_b + cur_head[:, None] * stride_o_h + split_kv_id * stride_o_s + offs_d_ckv[None, :] tl.store(O + offs_o, acc / e_sum[:, None]) offs_o_1 = cur_batch * stride_o_b + cur_head * stride_o_h + split_kv_id * stride_o_s + HEAD_DIM_CKV tl.store(O + offs_o_1, e_max + tl.log(e_sum)) @@ -306,24 +302,30 @@ def mla_decode_triton( @torch.inference_mode() -def run_flash_mla_triton(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, - cache_seqlens, h_q, h_kv, d, dv, causal, dtype): - +def run_flash_mla_triton(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype): blocked_v = blocked_k[..., :dv] assert d > dv, "mla with rope dim should be larger than no rope dim" q_nope, q_pe = q[..., :dv].contiguous(), q[..., 
dv:].contiguous() - blocked_k_nope, blocked_k_pe = blocked_k[..., :dv].contiguous(), blocked_k[..., - dv:].contiguous() + blocked_k_nope, blocked_k_pe = blocked_k[..., :dv].contiguous(), blocked_k[..., dv:].contiguous() def flash_mla_triton(): num_kv_splits = 32 o = torch.empty([b * s_q, h_q, dv]) attn_logits = torch.empty([b * s_q, h_q, num_kv_splits, dv + 1]) mla_decode_triton( - q_nope.view(-1, h_q, dv), q_pe.view(-1, h_q, d - dv), blocked_k_nope.view(-1, dv), - blocked_k_pe.view(-1, d - dv), o, block_table, cache_seqlens, attn_logits, - num_kv_splits, 1 / math.sqrt(d), block_size) + q_nope.view(-1, h_q, dv), + q_pe.view(-1, h_q, d - dv), + blocked_k_nope.view(-1, dv), + blocked_k_pe.view(-1, d - dv), + o, + block_table, + cache_seqlens, + attn_logits, + num_kv_splits, + 1 / math.sqrt(d), + block_size, + ) return o.view([b, s_q, h_q, dv]) out_flash = flash_mla_triton() @@ -359,14 +361,15 @@ def compare_ab(baseline, target, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal q = torch.randn(b, s_q, h_q, d) block_size = 64 - block_table = torch.arange( - b * max_seqlen_pad // block_size, dtype=torch.int32).view(b, max_seqlen_pad // block_size) + block_table = torch.arange(b * max_seqlen_pad // block_size, dtype=torch.int32).view(b, max_seqlen_pad // block_size) blocked_k = torch.randn(block_table.numel(), block_size, h_kv, d) - out_a, lse_a, perf_a = baseline_func(q, block_table, blocked_k, max_seqlen_pad, block_size, b, - s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype) - out_b, lse_b, perf_b = target_func(q, block_table, blocked_k, max_seqlen_pad, block_size, b, - s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype) + out_a, lse_a, perf_a = baseline_func( + q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype + ) + out_b, lse_b, perf_b = target_func( + q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype + ) torch.testing.assert_close(out_b.float(), out_a.float(), atol=1e-2, rtol=1e-2), "out" if target not in ["flash_mla_triton"]: @@ -374,21 +377,14 @@ def compare_ab(baseline, target, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal torch.testing.assert_close(lse_b.float(), lse_a.float(), atol=1e-2, rtol=1e-2), "lse" FLOPS = s_q * total_seqlens * h_q * (d + dv) * 2 - bytes = (total_seqlens * h_kv * d + b * s_q * h_q * d + b * s_q * h_q * dv) * ( - torch.finfo(dtype).bits // 8) - print( - f"perf {baseline}: {perf_a:.3f} ms, {FLOPS / 10 ** 9 / perf_a:.0f} TFLOPS, {bytes / 10 ** 6 / perf_a:.0f} GB/s" - ) - print( - f"perf {target}: {perf_b:.3f} ms, {FLOPS / 10 ** 9 / perf_b:.0f} TFLOPS, {bytes / 10 ** 6 / perf_b:.0f} GB/s" - ) + bytes = (total_seqlens * h_kv * d + b * s_q * h_q * d + b * s_q * h_q * dv) * (torch.finfo(dtype).bits // 8) + print(f"perf {baseline}: {perf_a:.3f} ms, {FLOPS / 10**9 / perf_a:.0f} TFLOPS, {bytes / 10**6 / perf_a:.0f} GB/s") + print(f"perf {target}: {perf_b:.3f} ms, {FLOPS / 10**9 / perf_b:.0f} TFLOPS, {bytes / 10**6 / perf_b:.0f} GB/s") return bytes / 10**6 / perf_a, bytes / 10**6 / perf_b def compare_a(target, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype): - print( - f"{target}: {b=}, {s_q=}, mean_seqlens={cache_seqlens.float().mean()}, {h_q=}, {h_kv=}, {d=}, {dv=}, {causal=}, {dtype=}" - ) + print(f"{target}: {b=}, {s_q=}, mean_seqlens={cache_seqlens.float().mean()}, {h_q=}, {h_kv=}, {d=}, {dv=}, {causal=}, {dtype=}") torch.set_default_dtype(dtype) device = torch.device("cuda:0") torch.set_default_device(device) @@ -405,19 +401,16 @@ def 
compare_a(target, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype): q = torch.randn(b, s_q, h_q, d) block_size = 64 - block_table = torch.arange( - b * max_seqlen_pad // block_size, dtype=torch.int32).view(b, max_seqlen_pad // block_size) + block_table = torch.arange(b * max_seqlen_pad // block_size, dtype=torch.int32).view(b, max_seqlen_pad // block_size) blocked_k = torch.randn(block_table.numel(), block_size, h_kv, d) - out_b, lse_b, perf_b = target_func(q, block_table, blocked_k, max_seqlen_pad, block_size, b, - s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype) + out_b, lse_b, perf_b = target_func( + q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype + ) FLOPS = s_q * total_seqlens * h_q * (d + dv) * 2 - bytes = (total_seqlens * h_kv * d + b * s_q * h_q * d + b * s_q * h_q * dv) * ( - torch.finfo(dtype).bits // 8) - print( - f"perf {target}: {perf_b:.3f} ms, {FLOPS / 10 ** 9 / perf_b:.0f} TFLOPS, {bytes / 10 ** 6 / perf_b:.0f} GB/s" - ) + bytes = (total_seqlens * h_kv * d + b * s_q * h_q * d + b * s_q * h_q * dv) * (torch.finfo(dtype).bits // 8) + print(f"perf {target}: {perf_b:.3f} ms, {FLOPS / 10**9 / perf_b:.0f} TFLOPS, {bytes / 10**6 / perf_b:.0f} GB/s") return bytes / 10**6 / perf_b @@ -426,26 +419,22 @@ available_targets = [ "flash_mla_triton", ] -shape_configs = [{ - "b": - batch, - "s_q": - 1, - "cache_seqlens": - torch.tensor([seqlen + 2 * i for i in range(batch)], dtype=torch.int32, device="cuda"), - "h_q": - head, - "h_kv": - 1, - "d": - 512 + 64, - "dv": - 512, - "causal": - True, - "dtype": - torch.float16 -} for batch in [64, 128] for seqlen in [1024, 2048, 4096, 8192, 16384] for head in [128]] +shape_configs = [ + { + "b": batch, + "s_q": 1, + "cache_seqlens": torch.tensor([seqlen + 2 * i for i in range(batch)], dtype=torch.int32, device="cuda"), + "h_q": head, + "h_kv": 1, + "d": 512 + 64, + "dv": 512, + "causal": True, + "dtype": torch.float16, + } + for batch in [64, 128] + for seqlen in [1024, 2048, 4096, 8192, 16384] + for head in [128] +] def get_args(): @@ -467,26 +456,54 @@ if __name__ == "__main__": for shape in shape_configs: if args.all: for target in available_targets: - perf = compare_a(target, shape["b"], shape["s_q"], shape["cache_seqlens"], - shape["h_q"], shape["h_kv"], shape["d"], shape["dv"], - shape["causal"], shape["dtype"]) + perf = compare_a( + target, + shape["b"], + shape["s_q"], + shape["cache_seqlens"], + shape["h_q"], + shape["h_kv"], + shape["d"], + shape["dv"], + shape["causal"], + shape["dtype"], + ) fout.write( - f'{target},{shape["b"]},{shape["cache_seqlens"].float().mean().cpu().item():.0f},{shape["h_q"]},{perf:.0f}\n' + f"{target},{shape['b']},{shape['cache_seqlens'].float().mean().cpu().item():.0f},{shape['h_q']},{perf:.0f}\n" ) elif args.compare: - perfa, prefb = compare_ab(args.baseline, args.target, shape["b"], shape["s_q"], - shape["cache_seqlens"], shape["h_q"], shape["h_kv"], - shape["d"], shape["dv"], shape["causal"], shape["dtype"]) + perfa, prefb = compare_ab( + args.baseline, + args.target, + shape["b"], + shape["s_q"], + shape["cache_seqlens"], + shape["h_q"], + shape["h_kv"], + shape["d"], + shape["dv"], + shape["causal"], + shape["dtype"], + ) fout.write( - f'{args.baseline},{shape["b"]},{shape["cache_seqlens"].float().mean().cpu().item():.0f},{shape["h_q"]},{perfa:.0f}\n' + f"{args.baseline},{shape['b']},{shape['cache_seqlens'].float().mean().cpu().item():.0f},{shape['h_q']},{perfa:.0f}\n" ) fout.write( - 
f'{args.target},{shape["b"]},{shape["cache_seqlens"].float().mean().cpu().item():.0f},{shape["h_q"]},{prefb:.0f}\n' + f"{args.target},{shape['b']},{shape['cache_seqlens'].float().mean().cpu().item():.0f},{shape['h_q']},{prefb:.0f}\n" ) elif args.one: - perf = compare_a(args.target, shape["b"], shape["s_q"], shape["cache_seqlens"], - shape["h_q"], shape["h_kv"], shape["d"], shape["dv"], - shape["causal"], shape["dtype"]) + perf = compare_a( + args.target, + shape["b"], + shape["s_q"], + shape["cache_seqlens"], + shape["h_q"], + shape["h_kv"], + shape["d"], + shape["dv"], + shape["causal"], + shape["dtype"], + ) fout.write( - f'{args.target},{shape["b"]},{shape["cache_seqlens"].float().mean().cpu().item():.0f},{shape["h_q"]},{perf:.0f}\n' + f"{args.target},{shape['b']},{shape['cache_seqlens'].float().mean().cpu().item():.0f},{shape['h_q']},{perf:.0f}\n" ) diff --git a/examples/deepseek_mla/benchmark_mla.py b/examples/deepseek_mla/benchmark_mla.py index a542ff611d1be481732696baaff09623ac283f2d..544b5e1285c173e1521f049e1de9521baa53afee 100644 --- a/examples/deepseek_mla/benchmark_mla.py +++ b/examples/deepseek_mla/benchmark_mla.py @@ -33,8 +33,7 @@ def scaled_dot_product_attention(query, key, value, h_q, h_kv, is_causal=False): @torch.inference_mode() -def run_torch_mla(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, - h_kv, d, dv, causal, dtype): +def run_torch_mla(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype): blocked_v = blocked_k[..., :dv] def ref_mla(): @@ -61,8 +60,7 @@ def run_torch_mla(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, @torch.inference_mode() -def run_flash_mla(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, - h_kv, d, dv, causal, dtype): +def run_flash_mla(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype): from flash_mla import flash_mla_with_kvcache, get_mla_metadata blocked_v = blocked_k[..., :dv] @@ -87,14 +85,13 @@ def run_flash_mla(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, @torch.inference_mode() -def run_flashinfer(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, - h_q, h_kv, d, dv, causal, dtype): +def run_flashinfer(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype): # pip install flashinfer-python import flashinfer + assert d > dv, "mla with rope dim should be larger than no rope dim" q_nope, q_pe = q[..., :dv].contiguous(), q[..., dv:].contiguous() - blocked_k_nope, blocked_k_pe = blocked_k[..., :dv].contiguous(), blocked_k[..., - dv:].contiguous() + blocked_k_nope, blocked_k_pe = blocked_k[..., :dv].contiguous(), blocked_k[..., dv:].contiguous() kv_indptr = [0] kv_indices = [] @@ -111,8 +108,7 @@ def run_flashinfer(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q kv_indptr = torch.tensor(kv_indptr, dtype=torch.int32) kv_indices = torch.tensor(kv_indices, dtype=torch.int32) - mla_wrapper = flashinfer.mla.BatchMLAPagedAttentionWrapper( - torch.empty(128 * 1024 * 1024, dtype=torch.int8), backend="fa3") + mla_wrapper = flashinfer.mla.BatchMLAPagedAttentionWrapper(torch.empty(128 * 1024 * 1024, dtype=torch.int8), backend="fa3") mla_wrapper.plan( q_indptr, kv_indptr, @@ -129,12 +125,7 @@ def run_flashinfer(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q ) def flashinfer(): - output, lse = mla_wrapper.run( - q_nope.view(-1, 
h_q, dv), - q_pe.view(-1, h_q, d - dv), - blocked_k_nope, - blocked_k_pe, - return_lse=True) + output, lse = mla_wrapper.run(q_nope.view(-1, h_q, dv), q_pe.view(-1, h_q, d - dv), blocked_k_nope, blocked_k_pe, return_lse=True) return output.view(b, -1, h_q, dv), lse.view(b, h_q, 1) out_flash, lse_flash = flashinfer() @@ -177,8 +168,7 @@ def _mla_attn_kernel( offs_d_ckv = tl.arange(0, HEAD_DIM_CKV) cur_head = cur_head_id * BLOCK_H + tl.arange(0, BLOCK_H) - offs_q_nope = cur_batch * stride_q_nope_bs + cur_head[:, None] * stride_q_nope_h + offs_d_ckv[ - None, :] + offs_q_nope = cur_batch * stride_q_nope_bs + cur_head[:, None] * stride_q_nope_h + offs_d_ckv[None, :] q_nope = tl.load(Q_nope + offs_q_nope) offs_d_kpe = tl.arange(0, HEAD_DIM_KPE) @@ -224,9 +214,7 @@ def _mla_attn_kernel( e_sum = e_sum * re_scale + tl.sum(p, 1) e_max = n_e_max - offs_o = cur_batch * stride_o_b + cur_head[:, - None] * stride_o_h + split_kv_id * stride_o_s + offs_d_ckv[ - None, :] + offs_o = cur_batch * stride_o_b + cur_head[:, None] * stride_o_h + split_kv_id * stride_o_s + offs_d_ckv[None, :] tl.store(O + offs_o, acc / e_sum[:, None]) offs_o_1 = cur_batch * stride_o_b + cur_head * stride_o_h + split_kv_id * stride_o_s + HEAD_DIM_CKV tl.store(O + offs_o_1, e_max + tl.log(e_sum)) @@ -393,24 +381,30 @@ def mla_decode_triton( @torch.inference_mode() -def run_flash_mla_triton(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, - cache_seqlens, h_q, h_kv, d, dv, causal, dtype): - +def run_flash_mla_triton(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype): blocked_v = blocked_k[..., :dv] assert d > dv, "mla with rope dim should be larger than no rope dim" q_nope, q_pe = q[..., :dv].contiguous(), q[..., dv:].contiguous() - blocked_k_nope, blocked_k_pe = blocked_k[..., :dv].contiguous(), blocked_k[..., - dv:].contiguous() + blocked_k_nope, blocked_k_pe = blocked_k[..., :dv].contiguous(), blocked_k[..., dv:].contiguous() def flash_mla_triton(): num_kv_splits = 32 o = torch.empty([b * s_q, h_q, dv]) attn_logits = torch.empty([b * s_q, h_q, num_kv_splits, dv + 1]) mla_decode_triton( - q_nope.view(-1, h_q, dv), q_pe.view(-1, h_q, d - dv), blocked_k_nope.view(-1, dv), - blocked_k_pe.view(-1, d - dv), o, block_table, cache_seqlens, attn_logits, - num_kv_splits, 1 / math.sqrt(d), block_size) + q_nope.view(-1, h_q, dv), + q_pe.view(-1, h_q, d - dv), + blocked_k_nope.view(-1, dv), + blocked_k_pe.view(-1, d - dv), + o, + block_table, + cache_seqlens, + attn_logits, + num_kv_splits, + 1 / math.sqrt(d), + block_size, + ) return o.view([b, s_q, h_q, dv]) out_flash = flash_mla_triton() @@ -419,13 +413,10 @@ def run_flash_mla_triton(q, block_table, blocked_k, max_seqlen_pad, block_size, @torch.inference_mode() -def run_flash_mla_tilelang(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, - cache_seqlens, h_q, h_kv, d, dv, causal, dtype): - +def run_flash_mla_tilelang(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype): assert d > dv, "mla with rope dim should be larger than no rope dim" q_nope, q_pe = q[..., :dv].contiguous(), q[..., dv:].contiguous() - blocked_k_nope, blocked_k_pe = blocked_k[..., :dv].contiguous(), blocked_k[..., - dv:].contiguous() + blocked_k_nope, blocked_k_pe = blocked_k[..., :dv].contiguous(), blocked_k[..., dv:].contiguous() dpe = d - dv num_kv_splits = 1 @@ -434,8 +425,7 @@ def run_flash_mla_tilelang(q, block_table, blocked_k, max_seqlen_pad, block_size out_partial = 
torch.empty(b, h_q, num_kv_splits, dv, dtype=dtype, device=q.device) glse = torch.empty(b, h_q, num_kv_splits, dtype=dtype, device=q.device) - kernel = mla_decode_tilelang(b, h_q, h_kv, max_seqlen_pad, dv, dpe, BLOCK_N, BLOCK_H, - num_kv_splits, block_size) + kernel = mla_decode_tilelang(b, h_q, h_kv, max_seqlen_pad, dv, dpe, BLOCK_N, BLOCK_H, num_kv_splits, block_size) def flash_mla_tilelang(): out = kernel( @@ -486,38 +476,31 @@ def compare_ab(baseline, target, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal q = torch.randn(b, s_q, h_q, d) block_size = 64 - block_table = torch.arange( - b * max_seqlen_pad // block_size, dtype=torch.int32).view(b, max_seqlen_pad // block_size) + block_table = torch.arange(b * max_seqlen_pad // block_size, dtype=torch.int32).view(b, max_seqlen_pad // block_size) blocked_k = torch.randn(block_table.numel(), block_size, h_kv, d) - out_a, lse_a, perf_a = baseline_func(q, block_table, blocked_k, max_seqlen_pad, block_size, b, - s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype) - out_b, lse_b, perf_b = target_func(q, block_table, blocked_k, max_seqlen_pad, block_size, b, - s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype) + out_a, lse_a, perf_a = baseline_func( + q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype + ) + out_b, lse_b, perf_b = target_func( + q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype + ) torch.testing.assert_close(out_b.float(), out_a.float(), atol=1e-2, rtol=1e-2), "out" - if target not in ["flashinfer", "flash_mla_triton", "tilelang" - ] and baseline not in ["flashinfer", "flash_mla_triton", "tilelang"]: + if target not in ["flashinfer", "flash_mla_triton", "tilelang"] and baseline not in ["flashinfer", "flash_mla_triton", "tilelang"]: # flashinfer has a different lse return value # flash_mla_triton and flash_mla_tilelang doesn't return lse torch.testing.assert_close(lse_b.float(), lse_a.float(), atol=1e-2, rtol=1e-2), "lse" FLOPS = s_q * total_seqlens * h_q * (d + dv) * 2 - bytes = (total_seqlens * h_kv * d + b * s_q * h_q * d + b * s_q * h_q * dv) * ( - torch.finfo(dtype).bits // 8) - print( - f"perf {baseline}: {perf_a:.3f} ms, {FLOPS / 10 ** 9 / perf_a:.0f} TFLOPS, {bytes / 10 ** 6 / perf_a:.0f} GB/s" - ) - print( - f"perf {target}: {perf_b:.3f} ms, {FLOPS / 10 ** 9 / perf_b:.0f} TFLOPS, {bytes / 10 ** 6 / perf_b:.0f} GB/s" - ) + bytes = (total_seqlens * h_kv * d + b * s_q * h_q * d + b * s_q * h_q * dv) * (torch.finfo(dtype).bits // 8) + print(f"perf {baseline}: {perf_a:.3f} ms, {FLOPS / 10**9 / perf_a:.0f} TFLOPS, {bytes / 10**6 / perf_a:.0f} GB/s") + print(f"perf {target}: {perf_b:.3f} ms, {FLOPS / 10**9 / perf_b:.0f} TFLOPS, {bytes / 10**6 / perf_b:.0f} GB/s") return bytes / 10**6 / perf_a, bytes / 10**6 / perf_b def compare_a(target, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype): - print( - f"{target}: {b=}, {s_q=}, mean_seqlens={cache_seqlens.float().mean()}, {h_q=}, {h_kv=}, {d=}, {dv=}, {causal=}, {dtype=}" - ) + print(f"{target}: {b=}, {s_q=}, mean_seqlens={cache_seqlens.float().mean()}, {h_q=}, {h_kv=}, {d=}, {dv=}, {causal=}, {dtype=}") torch.set_default_dtype(dtype) device = torch.device("cuda:0") torch.set_default_device(device) @@ -534,19 +517,16 @@ def compare_a(target, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype): q = torch.randn(b, s_q, h_q, d) block_size = 64 - block_table = torch.arange( - b * max_seqlen_pad // block_size, dtype=torch.int32).view(b, max_seqlen_pad // 
block_size) + block_table = torch.arange(b * max_seqlen_pad // block_size, dtype=torch.int32).view(b, max_seqlen_pad // block_size) blocked_k = torch.randn(block_table.numel(), block_size, h_kv, d) - out_b, lse_b, perf_b = target_func(q, block_table, blocked_k, max_seqlen_pad, block_size, b, - s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype) + out_b, lse_b, perf_b = target_func( + q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype + ) FLOPS = s_q * total_seqlens * h_q * (d + dv) * 2 - bytes = (total_seqlens * h_kv * d + b * s_q * h_q * d + b * s_q * h_q * dv) * ( - torch.finfo(dtype).bits // 8) - print( - f"perf {target}: {perf_b:.3f} ms, {FLOPS / 10 ** 9 / perf_b:.0f} TFLOPS, {bytes / 10 ** 6 / perf_b:.0f} GB/s" - ) + bytes = (total_seqlens * h_kv * d + b * s_q * h_q * d + b * s_q * h_q * dv) * (torch.finfo(dtype).bits // 8) + print(f"perf {target}: {perf_b:.3f} ms, {FLOPS / 10**9 / perf_b:.0f} TFLOPS, {bytes / 10**6 / perf_b:.0f} GB/s") return bytes / 10**6 / perf_b @@ -558,26 +538,22 @@ available_targets = [ "flash_mla_triton", ] -shape_configs = [{ - "b": - batch, - "s_q": - 1, - "cache_seqlens": - torch.tensor([seqlen + 2 * i for i in range(batch)], dtype=torch.int32, device="cuda"), - "h_q": - head, - "h_kv": - 1, - "d": - 512 + 64, - "dv": - 512, - "causal": - True, - "dtype": - torch.float16 -} for batch in [128] for seqlen in [1024, 2048, 4096, 8192, 16384, 32768] for head in [128]] +shape_configs = [ + { + "b": batch, + "s_q": 1, + "cache_seqlens": torch.tensor([seqlen + 2 * i for i in range(batch)], dtype=torch.int32, device="cuda"), + "h_q": head, + "h_kv": 1, + "d": 512 + 64, + "dv": 512, + "causal": True, + "dtype": torch.float16, + } + for batch in [128] + for seqlen in [1024, 2048, 4096, 8192, 16384, 32768] + for head in [128] +] def get_args(): @@ -599,26 +575,54 @@ if __name__ == "__main__": for shape in shape_configs: if args.all: for target in available_targets: - perf = compare_a(target, shape["b"], shape["s_q"], shape["cache_seqlens"], - shape["h_q"], shape["h_kv"], shape["d"], shape["dv"], - shape["causal"], shape["dtype"]) + perf = compare_a( + target, + shape["b"], + shape["s_q"], + shape["cache_seqlens"], + shape["h_q"], + shape["h_kv"], + shape["d"], + shape["dv"], + shape["causal"], + shape["dtype"], + ) fout.write( - f'{target},{shape["b"]},{shape["cache_seqlens"].float().mean().cpu().item():.0f},{shape["h_q"]},{perf:.0f}\n' + f"{target},{shape['b']},{shape['cache_seqlens'].float().mean().cpu().item():.0f},{shape['h_q']},{perf:.0f}\n" ) elif args.compare: - perfa, prefb = compare_ab(args.baseline, args.target, shape["b"], shape["s_q"], - shape["cache_seqlens"], shape["h_q"], shape["h_kv"], - shape["d"], shape["dv"], shape["causal"], shape["dtype"]) + perfa, prefb = compare_ab( + args.baseline, + args.target, + shape["b"], + shape["s_q"], + shape["cache_seqlens"], + shape["h_q"], + shape["h_kv"], + shape["d"], + shape["dv"], + shape["causal"], + shape["dtype"], + ) fout.write( - f'{args.baseline},{shape["b"]},{shape["cache_seqlens"].float().mean().cpu().item():.0f},{shape["h_q"]},{perfa:.0f}\n' + f"{args.baseline},{shape['b']},{shape['cache_seqlens'].float().mean().cpu().item():.0f},{shape['h_q']},{perfa:.0f}\n" ) fout.write( - f'{args.target},{shape["b"]},{shape["cache_seqlens"].float().mean().cpu().item():.0f},{shape["h_q"]},{prefb:.0f}\n' + f"{args.target},{shape['b']},{shape['cache_seqlens'].float().mean().cpu().item():.0f},{shape['h_q']},{prefb:.0f}\n" ) elif args.one: - perf = 
compare_a(args.target, shape["b"], shape["s_q"], shape["cache_seqlens"], - shape["h_q"], shape["h_kv"], shape["d"], shape["dv"], - shape["causal"], shape["dtype"]) + perf = compare_a( + args.target, + shape["b"], + shape["s_q"], + shape["cache_seqlens"], + shape["h_q"], + shape["h_kv"], + shape["d"], + shape["dv"], + shape["causal"], + shape["dtype"], + ) fout.write( - f'{args.target},{shape["b"]},{shape["cache_seqlens"].float().mean().cpu().item():.0f},{shape["h_q"]},{perf:.0f}\n' + f"{args.target},{shape['b']},{shape['cache_seqlens'].float().mean().cpu().item():.0f},{shape['h_q']},{perf:.0f}\n" ) diff --git a/examples/deepseek_mla/example_mla_decode.py b/examples/deepseek_mla/example_mla_decode.py index 417e319fd55fe3cedc41109c31e9d61c42d2656e..0d141b4b39500f7860a3308158e5bc5c30d4fb5d 100644 --- a/examples/deepseek_mla/example_mla_decode.py +++ b/examples/deepseek_mla/example_mla_decode.py @@ -8,25 +8,26 @@ import argparse @tilelang.jit( - out_idx=[6], pass_configs={ + out_idx=[6], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) -def flashattn(batch, heads, kv_head_num, seqlen_kv, dim, pe_dim, block_N, block_H, num_split, - softmax_scale): + }, +) +def flashattn(batch, heads, kv_head_num, seqlen_kv, dim, pe_dim, block_N, block_H, num_split, softmax_scale): scale = float(softmax_scale * 1.44269504) # log2(e) - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 kv_group_num = heads // kv_head_num VALID_BLOCK_H = min(block_H, kv_group_num) assert kv_head_num == 1, "kv_head_num must be 1" @T.macro def flash_attn( - Q: T.Tensor([batch, heads, dim], dtype), - Q_pe: T.Tensor([batch, heads, pe_dim], dtype), - KV: T.Tensor([batch, seqlen_kv, kv_head_num, dim], dtype), - K_pe: T.Tensor([batch, seqlen_kv, kv_head_num, pe_dim], dtype), - Output: T.Tensor([batch, heads, dim], dtype), + Q: T.Tensor([batch, heads, dim], dtype), + Q_pe: T.Tensor([batch, heads, pe_dim], dtype), + KV: T.Tensor([batch, seqlen_kv, kv_head_num, dim], dtype), + K_pe: T.Tensor([batch, seqlen_kv, kv_head_num, pe_dim], dtype), + Output: T.Tensor([batch, heads, dim], dtype), ): with T.Kernel(heads // min(block_H, kv_group_num), batch, threads=256) as (hid, bid): Q_shared = T.alloc_shared([block_H, dim], dtype) @@ -44,36 +45,29 @@ def flashattn(batch, heads, kv_head_num, seqlen_kv, dim, pe_dim, block_N, block_ logsum = T.alloc_fragment([block_H], accum_dtype) cur_kv_head = hid // (kv_group_num // block_H) - T.annotate_layout({ - O_shared: tilelang.layout.make_swizzled_layout(O_shared), - }) + T.annotate_layout( + { + O_shared: tilelang.layout.make_swizzled_layout(O_shared), + } + ) - T.copy(Q[bid, hid * VALID_BLOCK_H:(hid + 1) * VALID_BLOCK_H, :], Q_shared) - T.copy(Q_pe[bid, hid * VALID_BLOCK_H:(hid + 1) * VALID_BLOCK_H, :], Q_pe_shared) + T.copy(Q[bid, hid * VALID_BLOCK_H : (hid + 1) * VALID_BLOCK_H, :], Q_shared) + T.copy(Q_pe[bid, hid * VALID_BLOCK_H : (hid + 1) * VALID_BLOCK_H, :], Q_pe_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) loop_range = T.ceildiv(seqlen_kv, block_N) for k in T.Pipelined(loop_range, num_stages=2): - T.copy(KV[bid, k * block_N:(k + 1) * block_N, cur_kv_head, :], KV_shared) - T.copy(K_pe[bid, k * block_N:(k + 1) * block_N, cur_kv_head, :], K_pe_shared) - T.gemm( - Q_shared, - KV_shared, - acc_s, - transpose_B=True, - policy=T.GemmWarpPolicy.FullCol, - clear_accum=True) - T.gemm( - Q_pe_shared, - K_pe_shared, - acc_s, - transpose_B=True, - policy=T.GemmWarpPolicy.FullCol) + T.copy(KV[bid, k * block_N : 
(k + 1) * block_N, cur_kv_head, :], KV_shared) + T.copy(K_pe[bid, k * block_N : (k + 1) * block_N, cur_kv_head, :], K_pe_shared) + T.gemm(Q_shared, KV_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullCol, clear_accum=True) + T.gemm(Q_pe_shared, K_pe_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullCol) T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_H): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) for i in T.Parallel(block_H): scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_H, block_N): @@ -88,20 +82,18 @@ def flashattn(batch, heads, kv_head_num, seqlen_kv, dim, pe_dim, block_N, block_ for i, j in T.Parallel(block_H, dim): acc_o[i, j] /= logsum[i] T.copy(acc_o, O_shared) - T.copy(O_shared, Output[bid, hid * VALID_BLOCK_H:(hid + 1) * VALID_BLOCK_H, :]) + T.copy(O_shared, Output[bid, hid * VALID_BLOCK_H : (hid + 1) * VALID_BLOCK_H, :]) @T.macro def flash_attn_split( - Q: T.Tensor([batch, heads, dim], dtype), - Q_pe: T.Tensor([batch, heads, pe_dim], dtype), - KV: T.Tensor([batch, seqlen_kv, kv_head_num, dim], dtype), - K_pe: T.Tensor([batch, seqlen_kv, kv_head_num, pe_dim], dtype), - glse: T.Tensor([batch, heads, num_split], dtype), - Output_partial: T.Tensor([batch, heads, num_split, dim], dtype), + Q: T.Tensor([batch, heads, dim], dtype), + Q_pe: T.Tensor([batch, heads, pe_dim], dtype), + KV: T.Tensor([batch, seqlen_kv, kv_head_num, dim], dtype), + K_pe: T.Tensor([batch, seqlen_kv, kv_head_num, pe_dim], dtype), + glse: T.Tensor([batch, heads, num_split], dtype), + Output_partial: T.Tensor([batch, heads, num_split, dim], dtype), ): - with T.Kernel( - batch, heads // min(block_H, kv_group_num), num_split, - threads=256) as (bid, hid, bz): + with T.Kernel(batch, heads // min(block_H, kv_group_num), num_split, threads=256) as (bid, hid, bz): Q_shared = T.alloc_shared([block_H, dim], dtype) S_shared = T.alloc_shared([block_H, block_N], dtype) Q_pe_shared = T.alloc_shared([block_H, pe_dim], dtype) @@ -119,13 +111,15 @@ def flashattn(batch, heads, kv_head_num, seqlen_kv, dim, pe_dim, block_N, block_ cur_kv_head = hid // (kv_group_num // block_H) T.use_swizzle(10) - T.annotate_layout({ - O_shared: tilelang.layout.make_swizzled_layout(O_shared), - S_shared: tilelang.layout.make_swizzled_layout(S_shared), - }) + T.annotate_layout( + { + O_shared: tilelang.layout.make_swizzled_layout(O_shared), + S_shared: tilelang.layout.make_swizzled_layout(S_shared), + } + ) - T.copy(Q[bid, hid * VALID_BLOCK_H:(hid + 1) * VALID_BLOCK_H, :], Q_shared) - T.copy(Q_pe[bid, hid * VALID_BLOCK_H:(hid + 1) * VALID_BLOCK_H, :], Q_pe_shared) + T.copy(Q[bid, hid * VALID_BLOCK_H : (hid + 1) * VALID_BLOCK_H, :], Q_shared) + T.copy(Q_pe[bid, hid * VALID_BLOCK_H : (hid + 1) * VALID_BLOCK_H, :], Q_pe_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) @@ -137,17 +131,13 @@ def flashattn(batch, heads, kv_head_num, seqlen_kv, dim, pe_dim, block_N, block_ T.copy(KV[bid, kv_start:kv_end, cur_kv_head, :], KV_shared) T.copy(K_pe[bid, kv_start:kv_end, cur_kv_head, :], K_pe_shared) T.clear(acc_s) - T.gemm( - Q_shared, KV_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullCol) - T.gemm( - Q_pe_shared, - K_pe_shared, - acc_s, - transpose_B=True, - policy=T.GemmWarpPolicy.FullCol) + T.gemm(Q_shared, KV_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullCol) + T.gemm(Q_pe_shared, 
K_pe_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullCol) T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_H): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) for i in T.Parallel(block_H): scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_H, block_N): @@ -164,16 +154,15 @@ def flashattn(batch, heads, kv_head_num, seqlen_kv, dim, pe_dim, block_N, block_ acc_o[i, j] /= logsum[i] for i in T.Parallel(block_H): logsum[i] = T.log2(logsum[i]) + scores_max[i] * scale - T.copy(logsum, glse[bid, hid * VALID_BLOCK_H:(hid + 1) * VALID_BLOCK_H, bz]) + T.copy(logsum, glse[bid, hid * VALID_BLOCK_H : (hid + 1) * VALID_BLOCK_H, bz]) T.copy(acc_o, O_shared) - T.copy(O_shared, Output_partial[bid, hid * VALID_BLOCK_H:(hid + 1) * VALID_BLOCK_H, - bz, :]) + T.copy(O_shared, Output_partial[bid, hid * VALID_BLOCK_H : (hid + 1) * VALID_BLOCK_H, bz, :]) @T.macro def combine( - glse: T.Tensor([batch, heads, num_split], dtype), - Output_partial: T.Tensor([batch, heads, num_split, dim], dtype), - Output: T.Tensor([batch, heads, dim], dtype), + glse: T.Tensor([batch, heads, num_split], dtype), + Output_partial: T.Tensor([batch, heads, num_split, dim], dtype), + Output: T.Tensor([batch, heads, dim], dtype), ): with T.Kernel(heads, batch, threads=128) as (hid, bz): po_local = T.alloc_fragment([dim], dtype) @@ -183,9 +172,11 @@ def flashattn(batch, heads, kv_head_num, seqlen_kv, dim, pe_dim, block_N, block_ lse_max_local = T.alloc_local([1], accum_dtype) scale_local = T.alloc_local([1], accum_dtype) - T.annotate_layout({ - lse_logsum_local: T.Fragment(lse_logsum_local.shape, forward_thread_fn=lambda i: i), - }) + T.annotate_layout( + { + lse_logsum_local: T.Fragment(lse_logsum_local.shape, forward_thread_fn=lambda i: i), + } + ) T.clear(lse_logsum_local) T.clear(o_accum_local) @@ -208,26 +199,26 @@ def flashattn(batch, heads, kv_head_num, seqlen_kv, dim, pe_dim, block_N, block_ @T.prim_func def main_split( - Q: T.Tensor([batch, heads, dim], dtype), - Q_pe: T.Tensor([batch, heads, pe_dim], dtype), - KV: T.Tensor([batch, seqlen_kv, kv_head_num, dim], dtype), - K_pe: T.Tensor([batch, seqlen_kv, kv_head_num, pe_dim], dtype), - glse: T.Tensor([batch, heads, num_split], dtype), - Output_partial: T.Tensor([batch, heads, num_split, dim], dtype), - Output: T.Tensor([batch, heads, dim], dtype), + Q: T.Tensor([batch, heads, dim], dtype), + Q_pe: T.Tensor([batch, heads, pe_dim], dtype), + KV: T.Tensor([batch, seqlen_kv, kv_head_num, dim], dtype), + K_pe: T.Tensor([batch, seqlen_kv, kv_head_num, pe_dim], dtype), + glse: T.Tensor([batch, heads, num_split], dtype), + Output_partial: T.Tensor([batch, heads, num_split, dim], dtype), + Output: T.Tensor([batch, heads, dim], dtype), ): flash_attn_split(Q, Q_pe, KV, K_pe, glse, Output_partial) combine(glse, Output_partial, Output) @T.prim_func def main_no_split( - Q: T.Tensor([batch, heads, dim], dtype), - Q_pe: T.Tensor([batch, heads, pe_dim], dtype), - KV: T.Tensor([batch, seqlen_kv, kv_head_num, dim], dtype), - K_pe: T.Tensor([batch, seqlen_kv, kv_head_num, pe_dim], dtype), - glse: T.Tensor([batch, heads, num_split], dtype), - Output_partial: T.Tensor([batch, heads, num_split, dim], dtype), - Output: T.Tensor([batch, heads, dim], dtype), + Q: T.Tensor([batch, heads, dim], dtype), + Q_pe: T.Tensor([batch, heads, pe_dim], dtype), + KV: T.Tensor([batch, seqlen_kv, kv_head_num, dim], dtype), + K_pe: 
T.Tensor([batch, seqlen_kv, kv_head_num, pe_dim], dtype), + glse: T.Tensor([batch, heads, num_split], dtype), + Output_partial: T.Tensor([batch, heads, num_split, dim], dtype), + Output: T.Tensor([batch, heads, dim], dtype), ): flash_attn(Q, Q_pe, KV, K_pe, Output) @@ -252,31 +243,24 @@ def ref_program(q, q_pe, kv, k_pe, glse, Output_partial): dim = q.shape[-1] pe_dim = q_pe.shape[-1] num_head_groups = q.shape[1] // kv.shape[2] - scale = (dim + pe_dim)**0.5 - q = rearrange( - q, 'b (h g) d -> b g h d', g=num_head_groups) # [batch_size, num_head_groups, groups, dim] + scale = (dim + pe_dim) ** 0.5 + q = rearrange(q, "b (h g) d -> b g h d", g=num_head_groups) # [batch_size, num_head_groups, groups, dim] - q_pe = rearrange( - q_pe, 'b (h g) d -> b g h d', - g=num_head_groups) # [batch_size, num_head_groups, groups, pe_dim] + q_pe = rearrange(q_pe, "b (h g) d -> b g h d", g=num_head_groups) # [batch_size, num_head_groups, groups, pe_dim] - kv = rearrange(kv, 'b n h d -> b h n d') # [batch_size, groups, seqlen_kv, dim] + kv = rearrange(kv, "b n h d -> b h n d") # [batch_size, groups, seqlen_kv, dim] - k_pe = rearrange(k_pe, 'b n h d -> b h n d') # [batch_size, num_head_groups, groups, pe_dim] + k_pe = rearrange(k_pe, "b n h d -> b h n d") # [batch_size, num_head_groups, groups, pe_dim] query = torch.concat([q, q_pe], dim=-1) key = torch.concat([kv, k_pe], dim=-1) - scores = einsum( - query, key, - 'b g h d, b h s d -> b g h s') # [batch_size, num_head_groups, groups, seqlen_kv] + scores = einsum(query, key, "b g h d, b h s d -> b g h s") # [batch_size, num_head_groups, groups, seqlen_kv] - attention = F.softmax( - scores / scale, dim=-1) # [batch_size, num_head_groups, groups, seqlen_kv] + attention = F.softmax(scores / scale, dim=-1) # [batch_size, num_head_groups, groups, seqlen_kv] - out = einsum(attention, kv, - 'b g h s, b h s d -> b g h d') # [batch_size, num_head_groups, groups, dim] - out = rearrange(out, 'b g h d -> b (h g) d') # [batch_size, heads, dim] + out = einsum(attention, kv, "b g h s, b h s d -> b g h d") # [batch_size, num_head_groups, groups, dim] + out = rearrange(out, "b g h d -> b (h g) d") # [batch_size, heads, dim] return out @@ -294,10 +278,9 @@ def main( BLOCK_N = 64 BLOCK_H = min(64, heads // kv_heads) num_split = 1 - softmax_scale = (dim + pe_dim)**-0.5 + softmax_scale = (dim + pe_dim) ** -0.5 - kernel = flashattn(batch, heads, kv_heads, kv_ctx, dim, pe_dim, BLOCK_N, BLOCK_H, num_split, - softmax_scale) + kernel = flashattn(batch, heads, kv_heads, kv_ctx, dim, pe_dim, BLOCK_N, BLOCK_H, num_split, softmax_scale) profiler = kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Randn) profiler.assert_allclose(ref_program, rtol=1e-4, atol=1e-4) latency = profiler.do_bench(warmup=500) @@ -307,12 +290,12 @@ def main( if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=132, help='batch size') - parser.add_argument('--heads', type=int, default=128, help='q heads number') - parser.add_argument('--kv_heads', type=int, default=1, help='kv heads number') - parser.add_argument('--kv_ctx', type=int, default=8192, help='kv context length') - parser.add_argument('--dim', type=int, default=512, help='head dim') - parser.add_argument('--pe_dim', type=int, default=64, help='pe head dim') + parser.add_argument("--batch", type=int, default=132, help="batch size") + parser.add_argument("--heads", type=int, default=128, help="q heads number") + parser.add_argument("--kv_heads", type=int, default=1, help="kv heads 
number") + parser.add_argument("--kv_ctx", type=int, default=8192, help="kv context length") + parser.add_argument("--dim", type=int, default=512, help="head dim") + parser.add_argument("--pe_dim", type=int, default=64, help="pe head dim") args = parser.parse_args() batch, heads, kv_heads, kv_ctx, dim, pe_dim = args.batch, args.heads, args.kv_heads, args.kv_ctx, args.dim, args.pe_dim main(batch, heads, kv_heads, kv_ctx, dim, pe_dim) diff --git a/examples/deepseek_mla/example_mla_decode_paged.py b/examples/deepseek_mla/example_mla_decode_paged.py index fe50d4d4fd96fef836e0714be5ac9948f4913425..23001bde8a1fc7846d45f406d9e0db4d95252ece 100644 --- a/examples/deepseek_mla/example_mla_decode_paged.py +++ b/examples/deepseek_mla/example_mla_decode_paged.py @@ -8,25 +8,17 @@ import math @tilelang.jit( - out_idx=[8], pass_configs={ + out_idx=[8], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) -def mla_decode_tilelang(batch, - h_q, - h_kv, - max_seqlen_pad, - dv, - dpe, - block_N, - block_H, - num_split, - block_size, - softmax_scale=None): + }, +) +def mla_decode_tilelang(batch, h_q, h_kv, max_seqlen_pad, dv, dpe, block_N, block_H, num_split, block_size, softmax_scale=None): if softmax_scale is None: - softmax_scale = (dv + dpe)**-0.5 + softmax_scale = (dv + dpe) ** -0.5 scale = float(softmax_scale * 1.44269504) # log2(e) - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 kv_group_num = h_q // h_kv VALID_BLOCK_H = min(block_H, kv_group_num) assert h_kv == 1, "h_kv must be 1" @@ -34,13 +26,13 @@ def mla_decode_tilelang(batch, @T.macro def flash_mla_kernel( - Q: T.Tensor([batch, h_q, dv], dtype), - Q_pe: T.Tensor([batch, h_q, dpe], dtype), - KV: T.Tensor([batch * max_seqlen_pad, h_kv, dv], dtype), - K_pe: T.Tensor([batch * max_seqlen_pad, h_kv, dpe], dtype), - BLOCK_TABLE: T.Tensor([batch, max_seqlen_pad // block_size], "int32"), - CACHE_SEQLENS: T.Tensor([batch], "int32"), - Output: T.Tensor([batch, h_q, dv], dtype), + Q: T.Tensor([batch, h_q, dv], dtype), + Q_pe: T.Tensor([batch, h_q, dpe], dtype), + KV: T.Tensor([batch * max_seqlen_pad, h_kv, dv], dtype), + K_pe: T.Tensor([batch * max_seqlen_pad, h_kv, dpe], dtype), + BLOCK_TABLE: T.Tensor([batch, max_seqlen_pad // block_size], T.int32), + CACHE_SEQLENS: T.Tensor([batch], T.int32), + Output: T.Tensor([batch, h_q, dv], dtype), ): with T.Kernel(batch, h_q // min(block_H, kv_group_num), threads=256) as (bx, by): Q_shared = T.alloc_shared([block_H, dv], dtype) @@ -59,13 +51,15 @@ def mla_decode_tilelang(batch, cur_kv_head = by // (kv_group_num // block_H) T.use_swizzle(10) - T.annotate_layout({ - O_shared: tilelang.layout.make_swizzled_layout(O_shared), - S_shared: tilelang.layout.make_swizzled_layout(S_shared), - }) + T.annotate_layout( + { + O_shared: tilelang.layout.make_swizzled_layout(O_shared), + S_shared: tilelang.layout.make_swizzled_layout(S_shared), + } + ) - T.copy(Q[bx, by * VALID_BLOCK_H:(by + 1) * VALID_BLOCK_H, :], Q_shared) - T.copy(Q_pe[bx, by * VALID_BLOCK_H:(by + 1) * VALID_BLOCK_H, :], Q_pe_shared) + T.copy(Q[bx, by * VALID_BLOCK_H : (by + 1) * VALID_BLOCK_H, :], Q_shared) + T.copy(Q_pe[bx, by * VALID_BLOCK_H : (by + 1) * VALID_BLOCK_H, :], Q_pe_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) @@ -73,26 +67,20 @@ def mla_decode_tilelang(batch, loop_range = T.ceildiv(CACHE_SEQLENS[bx], block_N) for kr in T.Pipelined(loop_range, num_stages=2): k = loop_range - 1 - kr - kv_start = BLOCK_TABLE[bx, (k * block_N) // - block_size] * 
block_size + (k * block_N) % block_size - T.copy(KV[kv_start:kv_start + block_N, cur_kv_head, :], KV_shared) - T.copy(K_pe[kv_start:kv_start + block_N, cur_kv_head, :], K_pe_shared) + kv_start = BLOCK_TABLE[bx, (k * block_N) // block_size] * block_size + (k * block_N) % block_size + T.copy(KV[kv_start : kv_start + block_N, cur_kv_head, :], KV_shared) + T.copy(K_pe[kv_start : kv_start + block_N, cur_kv_head, :], K_pe_shared) T.clear(acc_s) - T.gemm( - Q_shared, KV_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullCol) - T.gemm( - Q_pe_shared, - K_pe_shared, - acc_s, - transpose_B=True, - policy=T.GemmWarpPolicy.FullCol) + T.gemm(Q_shared, KV_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullCol) + T.gemm(Q_pe_shared, K_pe_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullCol) T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) if kr == 0: for i, j in T.Parallel(block_H, block_N): - acc_s[i, j] = T.if_then_else(k * block_N + j >= CACHE_SEQLENS[bx], - -T.infinity(accum_dtype), acc_s[i, j]) + acc_s[i, j] = T.if_then_else(k * block_N + j >= CACHE_SEQLENS[bx], -T.infinity(accum_dtype), acc_s[i, j]) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_H): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) for i in T.Parallel(block_H): scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_H, block_N): @@ -107,21 +95,20 @@ def mla_decode_tilelang(batch, for i, j in T.Parallel(block_H, dv): acc_o[i, j] /= logsum[i] T.copy(acc_o, O_shared) - T.copy(O_shared, Output[bx, by * VALID_BLOCK_H:(by + 1) * VALID_BLOCK_H, :]) + T.copy(O_shared, Output[bx, by * VALID_BLOCK_H : (by + 1) * VALID_BLOCK_H, :]) @T.macro def flash_mla_split_kv_kernel( - Q: T.Tensor([batch, h_q, dv], dtype), - Q_pe: T.Tensor([batch, h_q, dpe], dtype), - KV: T.Tensor([batch * max_seqlen_pad, h_kv, dv], dtype), - K_pe: T.Tensor([batch * max_seqlen_pad, h_kv, dpe], dtype), - BLOCK_TABLE: T.Tensor([batch, max_seqlen_pad // block_size], "int32"), - CACHE_SEQLENS: T.Tensor([batch], "int32"), - glse: T.Tensor([batch, h_q, num_split], dtype), - Output_partial: T.Tensor([batch, h_q, num_split, dv], dtype), + Q: T.Tensor([batch, h_q, dv], dtype), + Q_pe: T.Tensor([batch, h_q, dpe], dtype), + KV: T.Tensor([batch * max_seqlen_pad, h_kv, dv], dtype), + K_pe: T.Tensor([batch * max_seqlen_pad, h_kv, dpe], dtype), + BLOCK_TABLE: T.Tensor([batch, max_seqlen_pad // block_size], T.int32), + CACHE_SEQLENS: T.Tensor([batch], T.int32), + glse: T.Tensor([batch, h_q, num_split], dtype), + Output_partial: T.Tensor([batch, h_q, num_split, dv], dtype), ): - with T.Kernel( - batch, h_q // min(block_H, kv_group_num), num_split, threads=256) as (bx, by, bz): + with T.Kernel(batch, h_q // min(block_H, kv_group_num), num_split, threads=256) as (bx, by, bz): Q_shared = T.alloc_shared([block_H, dv], dtype) S_shared = T.alloc_shared([block_H, block_N], dtype) Q_pe_shared = T.alloc_shared([block_H, dpe], dtype) @@ -139,13 +126,15 @@ def mla_decode_tilelang(batch, cur_kv_head = by // (kv_group_num // block_H) T.use_swizzle(10) - T.annotate_layout({ - O_shared: tilelang.layout.make_swizzled_layout(O_shared), - S_shared: tilelang.layout.make_swizzled_layout(S_shared), - }) + T.annotate_layout( + { + O_shared: tilelang.layout.make_swizzled_layout(O_shared), + S_shared: tilelang.layout.make_swizzled_layout(S_shared), + } + ) - T.copy(Q[bx, by * VALID_BLOCK_H:(by + 1) * VALID_BLOCK_H, :], Q_shared) - T.copy(Q_pe[bx, by * 
VALID_BLOCK_H:(by + 1) * VALID_BLOCK_H, :], Q_pe_shared) + T.copy(Q[bx, by * VALID_BLOCK_H : (by + 1) * VALID_BLOCK_H, :], Q_shared) + T.copy(Q_pe[bx, by * VALID_BLOCK_H : (by + 1) * VALID_BLOCK_H, :], Q_pe_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) @@ -153,29 +142,23 @@ def mla_decode_tilelang(batch, total_blocks = T.ceildiv(CACHE_SEQLENS[bx], block_N) blocks_per_split = T.floordiv(total_blocks, num_split) remaining_blocks = T.floormod(total_blocks, num_split) - loop_range = (blocks_per_split + T.if_then_else(bz < remaining_blocks, 1, 0)) + loop_range = blocks_per_split + T.if_then_else(bz < remaining_blocks, 1, 0) start = (blocks_per_split * bz + T.min(bz, remaining_blocks)) * block_N for k in T.Pipelined(loop_range, num_stages=2): - kv_start = BLOCK_TABLE[bx, (start + k * block_N) // - block_size] * block_size + (k * block_N) % block_size - T.copy(KV[kv_start:kv_start + block_N, cur_kv_head, :], KV_shared) - T.copy(K_pe[kv_start:kv_start + block_N, cur_kv_head, :], K_pe_shared) + kv_start = BLOCK_TABLE[bx, (start + k * block_N) // block_size] * block_size + (k * block_N) % block_size + T.copy(KV[kv_start : kv_start + block_N, cur_kv_head, :], KV_shared) + T.copy(K_pe[kv_start : kv_start + block_N, cur_kv_head, :], K_pe_shared) T.clear(acc_s) - T.gemm( - Q_shared, KV_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullCol) - T.gemm( - Q_pe_shared, - K_pe_shared, - acc_s, - transpose_B=True, - policy=T.GemmWarpPolicy.FullCol) + T.gemm(Q_shared, KV_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullCol) + T.gemm(Q_pe_shared, K_pe_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullCol) T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) for i, j in T.Parallel(block_H, block_N): - acc_s[i, j] = T.if_then_else(start + k * block_N + j >= CACHE_SEQLENS[bx], - -T.infinity(accum_dtype), acc_s[i, j]) + acc_s[i, j] = T.if_then_else(start + k * block_N + j >= CACHE_SEQLENS[bx], -T.infinity(accum_dtype), acc_s[i, j]) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_H): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) for i in T.Parallel(block_H): scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_H, block_N): @@ -192,15 +175,15 @@ def mla_decode_tilelang(batch, acc_o[i, j] /= logsum[i] for i in T.Parallel(block_H): logsum[i] = T.log2(logsum[i]) + scores_max[i] * scale - T.copy(logsum, glse[bx, by * VALID_BLOCK_H:(by + 1) * VALID_BLOCK_H, bz]) + T.copy(logsum, glse[bx, by * VALID_BLOCK_H : (by + 1) * VALID_BLOCK_H, bz]) T.copy(acc_o, O_shared) - T.copy(O_shared, Output_partial[bx, by * VALID_BLOCK_H:(by + 1) * VALID_BLOCK_H, bz, :]) + T.copy(O_shared, Output_partial[bx, by * VALID_BLOCK_H : (by + 1) * VALID_BLOCK_H, bz, :]) @T.macro def combine( - glse: T.Tensor([batch, h_q, num_split], dtype), - Output_partial: T.Tensor([batch, h_q, num_split, dv], dtype), - Output: T.Tensor([batch, h_q, dv], dtype), + glse: T.Tensor([batch, h_q, num_split], dtype), + Output_partial: T.Tensor([batch, h_q, num_split, dv], dtype), + Output: T.Tensor([batch, h_q, dv], dtype), ): with T.Kernel(h_q, batch, threads=128) as (by, bz): po_local = T.alloc_fragment([dv], dtype) @@ -210,9 +193,11 @@ def mla_decode_tilelang(batch, lse_max_local = T.alloc_local([1], accum_dtype) scale_local = T.alloc_local([1], accum_dtype) - T.annotate_layout({ - lse_logsum_local: T.Fragment(lse_logsum_local.shape, forward_thread_fn=lambda i: i), - }) 
+ T.annotate_layout( + { + lse_logsum_local: T.Fragment(lse_logsum_local.shape, forward_thread_fn=lambda i: i), + } + ) T.clear(lse_logsum_local) T.clear(o_accum_local) @@ -235,31 +220,30 @@ def mla_decode_tilelang(batch, @T.prim_func def main_split( - Q: T.Tensor([batch, h_q, dv], dtype), - Q_pe: T.Tensor([batch, h_q, dpe], dtype), - KV: T.Tensor([batch * max_seqlen_pad, h_kv, dv], dtype), - K_pe: T.Tensor([batch * max_seqlen_pad, h_kv, dpe], dtype), - block_table: T.Tensor([batch, max_seqlen_pad // block_size], "int32"), - cache_seqlens: T.Tensor([batch], "int32"), - glse: T.Tensor([batch, h_q, num_split], dtype), - Output_partial: T.Tensor([batch, h_q, num_split, dv], dtype), - Output: T.Tensor([batch, h_q, dv], dtype), + Q: T.Tensor([batch, h_q, dv], dtype), + Q_pe: T.Tensor([batch, h_q, dpe], dtype), + KV: T.Tensor([batch * max_seqlen_pad, h_kv, dv], dtype), + K_pe: T.Tensor([batch * max_seqlen_pad, h_kv, dpe], dtype), + block_table: T.Tensor([batch, max_seqlen_pad // block_size], T.int32), + cache_seqlens: T.Tensor([batch], T.int32), + glse: T.Tensor([batch, h_q, num_split], dtype), + Output_partial: T.Tensor([batch, h_q, num_split, dv], dtype), + Output: T.Tensor([batch, h_q, dv], dtype), ): - flash_mla_split_kv_kernel(Q, Q_pe, KV, K_pe, block_table, cache_seqlens, glse, - Output_partial) + flash_mla_split_kv_kernel(Q, Q_pe, KV, K_pe, block_table, cache_seqlens, glse, Output_partial) combine(glse, Output_partial, Output) @T.prim_func def main_no_split( - Q: T.Tensor([batch, h_q, dv], dtype), - Q_pe: T.Tensor([batch, h_q, dpe], dtype), - KV: T.Tensor([batch * max_seqlen_pad, h_kv, dv], dtype), - K_pe: T.Tensor([batch * max_seqlen_pad, h_kv, dpe], dtype), - block_table: T.Tensor([batch, max_seqlen_pad // block_size], "int32"), - cache_seqlens: T.Tensor([batch], "int32"), - glse: T.Tensor([batch, h_q, num_split], dtype), - Output_partial: T.Tensor([batch, h_q, num_split, dv], dtype), - Output: T.Tensor([batch, h_q, dv], dtype), + Q: T.Tensor([batch, h_q, dv], dtype), + Q_pe: T.Tensor([batch, h_q, dpe], dtype), + KV: T.Tensor([batch * max_seqlen_pad, h_kv, dv], dtype), + K_pe: T.Tensor([batch * max_seqlen_pad, h_kv, dpe], dtype), + block_table: T.Tensor([batch, max_seqlen_pad // block_size], T.int32), + cache_seqlens: T.Tensor([batch], T.int32), + glse: T.Tensor([batch, h_q, num_split], dtype), + Output_partial: T.Tensor([batch, h_q, num_split, dv], dtype), + Output: T.Tensor([batch, h_q, dv], dtype), ): flash_mla_kernel(Q, Q_pe, KV, K_pe, block_table, cache_seqlens, Output) @@ -280,8 +264,7 @@ def scaled_dot_product_attention(query, key, value, h_q, h_kv, is_causal=False): s_q = query.shape[-2] s_k = key.shape[-2] attn_bias = torch.zeros(s_q, s_k, dtype=query.dtype, device=query.device) - temp_mask = torch.ones( - s_q, s_k, dtype=torch.bool, device=query.device).tril(diagonal=s_k - s_q) + temp_mask = torch.ones(s_q, s_k, dtype=torch.bool, device=query.device).tril(diagonal=s_k - s_q) attn_bias.masked_fill_(temp_mask.logical_not(), float("-inf")) attn_bias.to(query.dtype) attn_weight += attn_bias @@ -291,8 +274,7 @@ def scaled_dot_product_attention(query, key, value, h_q, h_kv, is_causal=False): @torch.inference_mode() -def run_torch_mla(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, - h_kv, d, dv, causal, dtype): +def run_torch_mla(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype): # q: [b, s_q, h_q, d] # block_table: [b, max_seqlen_pad // block_size] # blocked_k: [b * max_seqlen_pad // 
block_size, block_size, h_kv, d] @@ -321,13 +303,10 @@ def run_torch_mla(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, return out_torch -def run_tilelang_mla(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, - h_q, h_kv, d, dv, causal, dtype): - +def run_tilelang_mla(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype): assert d > dv, "mla with rope dim should be larger than no rope dim" q_nope, q_pe = q[..., :dv].contiguous(), q[..., dv:].contiguous() - blocked_k_nope, blocked_k_pe = blocked_k[..., :dv].contiguous(), blocked_k[..., - dv:].contiguous() + blocked_k_nope, blocked_k_pe = blocked_k[..., :dv].contiguous(), blocked_k[..., dv:].contiguous() dpe = d - dv num_kv_splits = 1 @@ -337,8 +316,7 @@ def run_tilelang_mla(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s out_partial = torch.empty(b, h_q, num_kv_splits, dv, dtype=dtype, device=q.device) glse = torch.empty(b, h_q, num_kv_splits, dtype=dtype, device=q.device) - kernel = mla_decode_tilelang(b, h_q, h_kv, max_seqlen_pad, dv, dpe, BLOCK_N, BLOCK_H, - num_kv_splits, block_size, softmax_scale) + kernel = mla_decode_tilelang(b, h_q, h_kv, max_seqlen_pad, dv, dpe, BLOCK_N, BLOCK_H, num_kv_splits, block_size, softmax_scale) profiler = kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Randn) def flash_mla_tilelang(): @@ -356,8 +334,7 @@ def run_tilelang_mla(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s out_flash = flash_mla_tilelang() t = do_bench(flash_mla_tilelang) - out_ref = run_torch_mla(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, - cache_seqlens, h_q, h_kv, d, dv, causal, dtype) + out_ref = run_torch_mla(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype) torch.testing.assert_close(out_flash, out_ref, rtol=0.01, atol=0.01) print("All close") return out_flash, t @@ -365,12 +342,12 @@ def run_tilelang_mla(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=128, help='batch size') - parser.add_argument('--h_q', type=int, default=128, help='q heads number') - parser.add_argument('--h_kv', type=int, default=1, help='kv heads number') - parser.add_argument('--cache_seqlen', type=int, default=8192, help='kv cache context length') - parser.add_argument('--d', type=int, default=576, help='query/key head dim, d = dv + dpe') - parser.add_argument('--dv', type=int, default=512, help='value head dim') + parser.add_argument("--batch", type=int, default=128, help="batch size") + parser.add_argument("--h_q", type=int, default=128, help="q heads number") + parser.add_argument("--h_kv", type=int, default=1, help="kv heads number") + parser.add_argument("--cache_seqlen", type=int, default=8192, help="kv cache context length") + parser.add_argument("--d", type=int, default=576, help="query/key head dim, d = dv + dpe") + parser.add_argument("--dv", type=int, default=512, help="value head dim") args = parser.parse_args() b, h_q, h_kv, cache_seqlen, d, dv = args.batch, args.h_q, args.h_kv, args.cache_seqlen, args.d, args.dv @@ -379,9 +356,7 @@ if __name__ == "__main__": s_q = 1 # for decode, s_q = 1 block_size = 64 - cache_seqlens = torch.tensor([cache_seqlen + 2 * i for i in range(b)], - dtype=torch.int32, - device=device) + cache_seqlens = torch.tensor([cache_seqlen + 2 * i for i in range(b)], dtype=torch.int32, 
device=device) dpe = d - dv causal = True @@ -393,12 +368,11 @@ if __name__ == "__main__": total_flops = s_q * total_seqlens * h_q * d * 2 q = torch.randn(b, s_q, h_q, d, dtype=dtype, device=device) - block_table = torch.arange( - b * max_seqlen_pad // block_size, dtype=torch.int32, - device=device).view(b, max_seqlen_pad // block_size) + block_table = torch.arange(b * max_seqlen_pad // block_size, dtype=torch.int32, device=device).view(b, max_seqlen_pad // block_size) blocked_k = torch.randn(block_table.numel(), block_size, h_kv, d, dtype=dtype, device=device) - out_flash, latency = run_tilelang_mla(q, block_table, blocked_k, max_seqlen_pad, block_size, b, - s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype) + out_flash, latency = run_tilelang_mla( + q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype + ) print("Tile-lang: {:.2f} ms".format(latency)) print("Tile-lang: {:.2f} TFlops".format(total_flops / latency * 1e-9)) diff --git a/examples/deepseek_mla/example_mla_decode_persistent.py b/examples/deepseek_mla/example_mla_decode_persistent.py index 3f57ea0518a7bc459a13361fbd4ebb9be33ae882..b6a1300a239b57de9ac82c4f1705bea20920e873 100644 --- a/examples/deepseek_mla/example_mla_decode_persistent.py +++ b/examples/deepseek_mla/example_mla_decode_persistent.py @@ -9,13 +9,15 @@ import argparse @tilelang.jit( - out_idx=[6], pass_configs={ + out_idx=[6], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def flashattn(batch, heads, kv_head_num, seqlen_kv, dim, pe_dim, block_N, block_H, num_split): - scale = (1.0 / (dim + pe_dim))**0.5 * 1.44269504 # log2(e) - dtype = "float16" - accum_dtype = "float" + scale = (1.0 / (dim + pe_dim)) ** 0.5 * 1.44269504 # log2(e) + dtype = T.float16 + accum_dtype = T.float32 kv_group_num = heads // kv_head_num VALID_BLOCK_H = min(block_H, kv_group_num) assert kv_head_num == 1, "kv_head_num must be 1" @@ -23,13 +25,13 @@ def flashattn(batch, heads, kv_head_num, seqlen_kv, dim, pe_dim, block_N, block_ @T.prim_func def main_split_persistent( - Q: T.Tensor([batch, heads, dim], dtype), - Q_pe: T.Tensor([batch, heads, pe_dim], dtype), - KV: T.Tensor([batch, seqlen_kv, kv_head_num, dim], dtype), - K_pe: T.Tensor([batch, seqlen_kv, kv_head_num, pe_dim], dtype), - glse: T.Tensor([batch, heads, num_split], dtype), - Output_partial: T.Tensor([batch, heads, num_split, dim], dtype), - Output: T.Tensor([batch, heads, dim], dtype), + Q: T.Tensor([batch, heads, dim], dtype), + Q_pe: T.Tensor([batch, heads, pe_dim], dtype), + KV: T.Tensor([batch, seqlen_kv, kv_head_num, dim], dtype), + K_pe: T.Tensor([batch, seqlen_kv, kv_head_num, pe_dim], dtype), + glse: T.Tensor([batch, heads, num_split], dtype), + Output_partial: T.Tensor([batch, heads, num_split, dim], dtype), + Output: T.Tensor([batch, heads, dim], dtype), ): with T.Kernel(sm_num, threads=256) as (block_id): Q_shared = T.alloc_shared([block_H, dim], dtype) @@ -53,11 +55,13 @@ def flashattn(batch, heads, kv_head_num, seqlen_kv, dim, pe_dim, block_N, block_ lse_max_local = T.alloc_local([1], accum_dtype) scale_local = T.alloc_local([1], accum_dtype) - T.annotate_layout({ - # O_shared: tilelang.layout.make_swizzled_layout(O_shared), - S_shared: tilelang.layout.make_swizzled_layout(S_shared), - lse_logsum_local: T.Fragment(lse_logsum_local.shape, forward_thread_fn=lambda i: i), - }) + T.annotate_layout( + { + # O_shared: tilelang.layout.make_swizzled_layout(O_shared), + S_shared: tilelang.layout.make_swizzled_layout(S_shared), + 
lse_logsum_local: T.Fragment(lse_logsum_local.shape, forward_thread_fn=lambda i: i), + } + ) T.use_swizzle(10) total_tiles = batch * (heads // min(block_H, kv_group_num)) * num_split @@ -70,8 +74,8 @@ def flashattn(batch, heads, kv_head_num, seqlen_kv, dim, pe_dim, block_N, block_ cur_kv_head = hid // (kv_group_num // block_H) if bid < batch and hid * VALID_BLOCK_H < heads and sid < num_split: - T.copy(Q[bid, hid * VALID_BLOCK_H:(hid + 1) * VALID_BLOCK_H, :], Q_shared) - T.copy(Q_pe[bid, hid * VALID_BLOCK_H:(hid + 1) * VALID_BLOCK_H, :], Q_pe_shared) + T.copy(Q[bid, hid * VALID_BLOCK_H : (hid + 1) * VALID_BLOCK_H, :], Q_shared) + T.copy(Q_pe[bid, hid * VALID_BLOCK_H : (hid + 1) * VALID_BLOCK_H, :], Q_pe_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) @@ -83,24 +87,15 @@ def flashattn(batch, heads, kv_head_num, seqlen_kv, dim, pe_dim, block_N, block_ T.copy(KV[bid, kv_start:kv_end, cur_kv_head, :], KV_shared) T.copy(K_pe[bid, kv_start:kv_end, cur_kv_head, :], K_pe_shared) T.clear(acc_s) - T.gemm( - Q_shared, - KV_shared, - acc_s, - transpose_B=True, - policy=T.GemmWarpPolicy.FullCol) - T.gemm( - Q_pe_shared, - K_pe_shared, - acc_s, - transpose_B=True, - policy=T.GemmWarpPolicy.FullCol) + T.gemm(Q_shared, KV_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullCol) + T.gemm(Q_pe_shared, K_pe_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullCol) T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, scores_max, dim=1, clear=False) for i in T.Parallel(block_H): - scores_scale[i] = T.exp2(scores_max_prev[i] * scale - - scores_max[i] * scale) + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) + for i in T.Parallel(block_H): + scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_H, block_N): acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) T.reduce_sum(acc_s, scores_sum, dim=1) @@ -115,11 +110,9 @@ def flashattn(batch, heads, kv_head_num, seqlen_kv, dim, pe_dim, block_N, block_ acc_o[i, j] /= logsum[i] for i in T.Parallel(block_H): logsum[i] = T.log2(logsum[i]) + scores_max[i] * scale - T.copy(logsum, glse[bid, hid * VALID_BLOCK_H:(hid + 1) * VALID_BLOCK_H, sid]) + T.copy(logsum, glse[bid, hid * VALID_BLOCK_H : (hid + 1) * VALID_BLOCK_H, sid]) # T.copy(acc_o, O_shared) - T.copy( - acc_o, Output_partial[bid, hid * VALID_BLOCK_H:(hid + 1) * VALID_BLOCK_H, - sid, :]) + T.copy(acc_o, Output_partial[bid, hid * VALID_BLOCK_H : (hid + 1) * VALID_BLOCK_H, sid, :]) T.sync_grid() waves = T.ceildiv(heads * batch, sm_num) @@ -165,42 +158,35 @@ def ref_program(q, q_pe, kv, k_pe, glse, Output_partial): dim = q.shape[-1] pe_dim = q_pe.shape[-1] num_head_groups = q.shape[1] // kv.shape[2] - scale = (dim + pe_dim)**0.5 - q = rearrange( - q, 'b (h g) d -> b g h d', g=num_head_groups) # [batch_size, num_head_groups, groups, dim] + scale = (dim + pe_dim) ** 0.5 + q = rearrange(q, "b (h g) d -> b g h d", g=num_head_groups) # [batch_size, num_head_groups, groups, dim] - q_pe = rearrange( - q_pe, 'b (h g) d -> b g h d', - g=num_head_groups) # [batch_size, num_head_groups, groups, pe_dim] + q_pe = rearrange(q_pe, "b (h g) d -> b g h d", g=num_head_groups) # [batch_size, num_head_groups, groups, pe_dim] - kv = rearrange(kv, 'b n h d -> b h n d') # [batch_size, groups, seqlen_kv, dim] + kv = rearrange(kv, "b n h d -> b h n d") # [batch_size, groups, seqlen_kv, dim] - k_pe = rearrange(k_pe, 'b n h d -> b h n d') # [batch_size, 
num_head_groups, groups, pe_dim] + k_pe = rearrange(k_pe, "b n h d -> b h n d") # [batch_size, num_head_groups, groups, pe_dim] query = torch.concat([q, q_pe], dim=-1) key = torch.concat([kv, k_pe], dim=-1) - scores = einsum( - query, key, - 'b g h d, b h s d -> b g h s') # [batch_size, num_head_groups, groups, seqlen_kv] + scores = einsum(query, key, "b g h d, b h s d -> b g h s") # [batch_size, num_head_groups, groups, seqlen_kv] - attention = F.softmax( - scores / scale, dim=-1) # [batch_size, num_head_groups, groups, seqlen_kv] + attention = F.softmax(scores / scale, dim=-1) # [batch_size, num_head_groups, groups, seqlen_kv] - out = einsum(attention, kv, - 'b g h s, b h s d -> b g h d') # [batch_size, num_head_groups, groups, dim] - out = rearrange(out, 'b g h d -> b (h g) d') # [batch_size, heads, dim] + out = einsum(attention, kv, "b g h s, b h s d -> b g h d") # [batch_size, num_head_groups, groups, dim] + out = rearrange(out, "b g h d -> b (h g) d") # [batch_size, heads, dim] return out def main(): parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=128, help='batch size') - parser.add_argument('--heads', type=int, default=128, help='q heads number') - parser.add_argument('--kv_heads', type=int, default=1, help='kv heads number') - parser.add_argument('--kv_ctx', type=int, default=8192, help='kv context length') - parser.add_argument('--dim', type=int, default=512, help='head dim') - parser.add_argument('--pe_dim', type=int, default=64, help='pe head dim') + parser.add_argument("--batch", type=int, default=128, help="batch size") + parser.add_argument("--heads", type=int, default=128, help="q heads number") + parser.add_argument("--kv_heads", type=int, default=1, help="kv heads number") + parser.add_argument("--kv_ctx", type=int, default=8192, help="kv context length") + parser.add_argument("--dim", type=int, default=512, help="head dim") + parser.add_argument("--pe_dim", type=int, default=64, help="pe head dim") args = parser.parse_args() batch, heads, kv_heads, kv_ctx, dim, pe_dim = args.batch, args.heads, args.kv_heads, args.kv_ctx, args.dim, args.pe_dim qk_flops = 2 * batch * heads * kv_ctx * (dim + pe_dim) diff --git a/examples/deepseek_mla/example_mla_decode_ws.py b/examples/deepseek_mla/example_mla_decode_ws.py index 6554d57de423d06b6c216bd7a0d8e083c1fe3b1c..8e317fa00183e39581f460ff6159efa4aefb7d52 100644 --- a/examples/deepseek_mla/example_mla_decode_ws.py +++ b/examples/deepseek_mla/example_mla_decode_ws.py @@ -13,28 +13,33 @@ import argparse tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, }, compile_flags=[ - "-O3", "-Wno-deprecated-declarations", "-U__CUDA_NO_HALF_OPERATORS__", - "-U__CUDA_NO_HALF_CONVERSIONS__", "-U__CUDA_NO_HALF2_OPERATORS__", - "-U__CUDA_NO_BFLOAT16_CONVERSIONS__", "--expt-relaxed-constexpr", "--expt-extended-lambda", - "--ptxas-options=-v,--register-usage-level=10", "-DNDEBUG" + "-O3", + "-Wno-deprecated-declarations", + "-U__CUDA_NO_HALF_OPERATORS__", + "-U__CUDA_NO_HALF_CONVERSIONS__", + "-U__CUDA_NO_HALF2_OPERATORS__", + "-U__CUDA_NO_BFLOAT16_CONVERSIONS__", + "--expt-relaxed-constexpr", + "--expt-extended-lambda", + "--ptxas-options=-v,--register-usage-level=10", + "-DNDEBUG", ], ) -def flashattn(batch, heads, kv_head_num, seqlen_kv, dim, pe_dim, block_N, block_H, num_split, - softmax_scale): +def flashattn(batch, heads, kv_head_num, seqlen_kv, dim, pe_dim, block_N, block_H, num_split, softmax_scale): sm_scale = float(softmax_scale * 1.44269504) # log2(e) - dtype = "float16" - accum_dtype = "float" + dtype = 
T.float16 + accum_dtype = T.float32 kv_group_num = heads // kv_head_num VALID_BLOCK_H = min(block_H, kv_group_num) assert kv_head_num == 1, "kv_head_num must be 1" @T.macro def flash_attn( - Q: T.Tensor([batch, heads, dim], dtype), - Q_pe: T.Tensor([batch, heads, pe_dim], dtype), - KV: T.Tensor([batch, seqlen_kv, kv_head_num, dim], dtype), - K_pe: T.Tensor([batch, seqlen_kv, kv_head_num, pe_dim], dtype), - Output: T.Tensor([batch, heads, dim], dtype), + Q: T.Tensor([batch, heads, dim], dtype), + Q_pe: T.Tensor([batch, heads, pe_dim], dtype), + KV: T.Tensor([batch, seqlen_kv, kv_head_num, dim], dtype), + K_pe: T.Tensor([batch, seqlen_kv, kv_head_num, pe_dim], dtype), + Output: T.Tensor([batch, heads, dim], dtype), ): with T.Kernel(heads // min(block_H, kv_group_num), batch, threads=384) as (hid, bid): Q_shared_l = T.alloc_shared([block_H, dim // 2], dtype) @@ -75,16 +80,16 @@ def flashattn(batch, heads, kv_head_num, seqlen_kv, dim, pe_dim, block_N, block_ tx = T.get_thread_binding() - T.copy(Q[bid, hid * VALID_BLOCK_H:(hid + 1) * VALID_BLOCK_H, 0:dim // 2], Q_shared_l) - T.copy(Q[bid, hid * VALID_BLOCK_H:(hid + 1) * VALID_BLOCK_H, dim // 2:dim], Q_shared_r) - T.copy(Q_pe[bid, hid * VALID_BLOCK_H:(hid + 1) * VALID_BLOCK_H, :], Q_tail_shared) + T.copy(Q[bid, hid * VALID_BLOCK_H : (hid + 1) * VALID_BLOCK_H, 0 : dim // 2], Q_shared_l) + T.copy(Q[bid, hid * VALID_BLOCK_H : (hid + 1) * VALID_BLOCK_H, dim // 2 : dim], Q_shared_r) + T.copy(Q_pe[bid, hid * VALID_BLOCK_H : (hid + 1) * VALID_BLOCK_H, :], Q_tail_shared) T.barrier_arrive(bar_q) if tx < 128: T.set_max_nreg(240, 1) T.fill(sumexp, 0) - T.fill(m_i, -2**30) # avoid -inf - inf to cause nan + T.fill(m_i, -(2**30)) # avoid -inf - inf to cause nan T.fill(acc_o_l, 0) T.barrier_wait(bar_q, 0) @@ -104,7 +109,9 @@ def flashattn(batch, heads, kv_head_num, seqlen_kv, dim, pe_dim, block_N, block_ T.barrier_wait(bar_sScale_and_sS_free, ((i_i * 2) & 1) ^ 1) T.copy(m_i, m_i_prev) - T.reduce_max(acc_s, m_i, dim=1, clear=False) + T.reduce_max(acc_s, out=m_i, dim=1, clear=False) + for h_i in T.Parallel(block_H): + m_i[h_i] = T.max(m_i[h_i], m_i_prev[h_i]) for h_i in T.Parallel(block_H): alpha_local[h_i] = T.exp2((m_i_prev[h_i] - m_i[h_i]) * sm_scale) for h_i, bi_i in T.Parallel(block_H, block_N): @@ -137,6 +144,8 @@ def flashattn(batch, heads, kv_head_num, seqlen_kv, dim, pe_dim, block_N, block_ T.copy(m_i, m_i_prev) T.reduce_max(acc_s, m_i, dim=1, clear=False) + for h_i in T.Parallel(block_H): + m_i[h_i] = T.max(m_i[h_i], m_i_prev[h_i]) for h_i in T.Parallel(block_H): alpha_local[h_i] = T.exp2((m_i_prev[h_i] - m_i[h_i]) * sm_scale) for h_i, bi_i in T.Parallel(block_H, block_N): @@ -162,8 +171,7 @@ def flashattn(batch, heads, kv_head_num, seqlen_kv, dim, pe_dim, block_N, block_ for h_i in T.Parallel(block_H): sumexp[h_i] = T.log2(sumexp[h_i]) + m_i[h_i] * sm_scale T.copy(acc_o_l, O_shared_l) - T.copy(O_shared_l, Output[bid, hid * VALID_BLOCK_H:(hid + 1) * VALID_BLOCK_H, - 0:dim // 2]) + T.copy(O_shared_l, Output[bid, hid * VALID_BLOCK_H : (hid + 1) * VALID_BLOCK_H, 0 : dim // 2]) elif tx >= 128 and tx < 256: T.set_max_nreg(168, 1) @@ -193,8 +201,7 @@ def flashattn(batch, heads, kv_head_num, seqlen_kv, dim, pe_dim, block_N, block_ acc_o_r[h_i, d_i] /= sum_exp_shared[h_i] T.copy(acc_o_r, O_shared_r) - T.copy(O_shared_r, Output[bid, hid * VALID_BLOCK_H:(hid + 1) * VALID_BLOCK_H, - dim // 2:dim]) + T.copy(O_shared_r, Output[bid, hid * VALID_BLOCK_H : (hid + 1) * VALID_BLOCK_H, dim // 2 : dim]) elif tx >= 256: # producer @@ -207,19 +214,17 @@ def flashattn(batch, 
heads, kv_head_num, seqlen_kv, dim, pe_dim, block_N, block_ with T.attr("default", "async_scope", 1): for u in T.serial(4): for v in T.vectorized(8): - KV_shared_0_l[r * 16 + (tx - 256) // 8, - 64 * u + (tx - 256) % 8 * 8 + - v] = KV[bid, kv_indices, cur_kv_head, - 64 * u + (tx - 256) % 8 * 8 + v] - KV_shared_0_r[r * 16 + (tx - 256) // 8, - 64 * u + (tx - 256) % 8 * 8 + - v] = KV[bid, kv_indices, cur_kv_head, dim // 2 + - 64 * u + (tx - 256) % 8 * 8 + v] + KV_shared_0_l[r * 16 + (tx - 256) // 8, 64 * u + (tx - 256) % 8 * 8 + v] = KV[ + bid, kv_indices, cur_kv_head, 64 * u + (tx - 256) % 8 * 8 + v + ] + KV_shared_0_r[r * 16 + (tx - 256) // 8, 64 * u + (tx - 256) % 8 * 8 + v] = KV[ + bid, kv_indices, cur_kv_head, dim // 2 + 64 * u + (tx - 256) % 8 * 8 + v + ] with T.attr("default", "async_scope", 1): for v in T.vectorized(8): - K_tail_shared_0[r * 16 + (tx - 256) // 8, (tx - 256) % 8 * 8 + - v] = K_pe[bid, kv_indices, cur_kv_head, - (tx - 256) % 8 * 8 + v] + K_tail_shared_0[r * 16 + (tx - 256) // 8, (tx - 256) % 8 * 8 + v] = K_pe[ + bid, kv_indices, cur_kv_head, (tx - 256) % 8 * 8 + v + ] T.cp_async_barrier_noinc(bar_k_0_ready[0]) # Buffer 1 @@ -229,33 +234,29 @@ def flashattn(batch, heads, kv_head_num, seqlen_kv, dim, pe_dim, block_N, block_ with T.attr("default", "async_scope", 1): for u in T.serial(4): for v in T.vectorized(8): - KV_shared_1_l[r * 16 + (tx - 256) // 8, - 64 * u + (tx - 256) % 8 * 8 + - v] = KV[bid, kv_indices, cur_kv_head, - 64 * u + (tx - 256) % 8 * 8 + v] - KV_shared_1_r[r * 16 + (tx - 256) // 8, - 64 * u + (tx - 256) % 8 * 8 + - v] = KV[bid, kv_indices, cur_kv_head, dim // 2 + - 64 * u + (tx - 256) % 8 * 8 + v] + KV_shared_1_l[r * 16 + (tx - 256) // 8, 64 * u + (tx - 256) % 8 * 8 + v] = KV[ + bid, kv_indices, cur_kv_head, 64 * u + (tx - 256) % 8 * 8 + v + ] + KV_shared_1_r[r * 16 + (tx - 256) // 8, 64 * u + (tx - 256) % 8 * 8 + v] = KV[ + bid, kv_indices, cur_kv_head, dim // 2 + 64 * u + (tx - 256) % 8 * 8 + v + ] with T.attr("default", "async_scope", 1): for v in T.vectorized(8): - K_tail_shared_1[r * 16 + (tx - 256) // 8, (tx - 256) % 8 * 8 + - v] = K_pe[bid, kv_indices, cur_kv_head, - (tx - 256) % 8 * 8 + v] + K_tail_shared_1[r * 16 + (tx - 256) // 8, (tx - 256) % 8 * 8 + v] = K_pe[ + bid, kv_indices, cur_kv_head, (tx - 256) % 8 * 8 + v + ] T.cp_async_barrier_noinc(bar_k_1_ready[0]) @T.macro def flash_attn_split( - Q: T.Tensor([batch, heads, dim], dtype), - Q_pe: T.Tensor([batch, heads, pe_dim], dtype), - KV: T.Tensor([batch, seqlen_kv, kv_head_num, dim], dtype), - K_pe: T.Tensor([batch, seqlen_kv, kv_head_num, pe_dim], dtype), - glse: T.Tensor([batch, heads, num_split], dtype), - Output_partial: T.Tensor([batch, heads, num_split, dim], dtype), + Q: T.Tensor([batch, heads, dim], dtype), + Q_pe: T.Tensor([batch, heads, pe_dim], dtype), + KV: T.Tensor([batch, seqlen_kv, kv_head_num, dim], dtype), + K_pe: T.Tensor([batch, seqlen_kv, kv_head_num, pe_dim], dtype), + glse: T.Tensor([batch, heads, num_split], dtype), + Output_partial: T.Tensor([batch, heads, num_split, dim], dtype), ): - with T.Kernel( - batch, heads // min(block_H, kv_group_num), num_split, - threads=384) as (bid, hid, bz): + with T.Kernel(batch, heads // min(block_H, kv_group_num), num_split, threads=384) as (bid, hid, bz): Q_shared_l = T.alloc_shared([block_H, dim // 2], dtype) Q_shared_r = T.alloc_shared([block_H, dim // 2], dtype) Q_tail_shared = T.alloc_shared([block_H, pe_dim], dtype) @@ -294,16 +295,16 @@ def flashattn(batch, heads, kv_head_num, seqlen_kv, dim, pe_dim, block_N, block_ tx = 
T.get_thread_binding() - T.copy(Q[bid, hid * VALID_BLOCK_H:(hid + 1) * VALID_BLOCK_H, 0:dim // 2], Q_shared_l) - T.copy(Q[bid, hid * VALID_BLOCK_H:(hid + 1) * VALID_BLOCK_H, dim // 2:dim], Q_shared_r) - T.copy(Q_pe[bid, hid * VALID_BLOCK_H:(hid + 1) * VALID_BLOCK_H, :], Q_tail_shared) + T.copy(Q[bid, hid * VALID_BLOCK_H : (hid + 1) * VALID_BLOCK_H, 0 : dim // 2], Q_shared_l) + T.copy(Q[bid, hid * VALID_BLOCK_H : (hid + 1) * VALID_BLOCK_H, dim // 2 : dim], Q_shared_r) + T.copy(Q_pe[bid, hid * VALID_BLOCK_H : (hid + 1) * VALID_BLOCK_H, :], Q_tail_shared) T.barrier_arrive(bar_q) if tx < 128: T.set_max_nreg(240, 1) T.fill(sumexp, 0) - T.fill(m_i, -2**30) # avoid -inf - inf to cause nan + T.fill(m_i, -(2**30)) # avoid -inf - inf to cause nan T.fill(acc_o_l, 0) T.barrier_wait(bar_q, 0) @@ -324,6 +325,8 @@ def flashattn(batch, heads, kv_head_num, seqlen_kv, dim, pe_dim, block_N, block_ T.copy(m_i, m_i_prev) T.reduce_max(acc_s, m_i, dim=1, clear=False) + for h_i in T.Parallel(block_H): + m_i[h_i] = T.max(m_i[h_i], m_i_prev[h_i]) for h_i in T.Parallel(block_H): alpha_local[h_i] = T.exp2((m_i_prev[h_i] - m_i[h_i]) * sm_scale) for h_i, bi_i in T.Parallel(block_H, block_N): @@ -356,6 +359,8 @@ def flashattn(batch, heads, kv_head_num, seqlen_kv, dim, pe_dim, block_N, block_ T.copy(m_i, m_i_prev) T.reduce_max(acc_s, m_i, dim=1, clear=False) + for h_i in T.Parallel(block_H): + m_i[h_i] = T.max(m_i[h_i], m_i_prev[h_i]) for h_i in T.Parallel(block_H): alpha_local[h_i] = T.exp2((m_i_prev[h_i] - m_i[h_i]) * sm_scale) for h_i, bi_i in T.Parallel(block_H, block_N): @@ -381,10 +386,8 @@ def flashattn(batch, heads, kv_head_num, seqlen_kv, dim, pe_dim, block_N, block_ for h_i in T.Parallel(block_H): sumexp[h_i] = T.log2(sumexp[h_i]) + m_i[h_i] * sm_scale T.copy(acc_o_l, O_shared_l) - T.copy( - O_shared_l, Output_partial[bid, hid * VALID_BLOCK_H:(hid + 1) * VALID_BLOCK_H, - bz, 0:dim // 2]) - T.copy(sumexp, glse[bid, hid * VALID_BLOCK_H:(hid + 1) * VALID_BLOCK_H, bz]) + T.copy(O_shared_l, Output_partial[bid, hid * VALID_BLOCK_H : (hid + 1) * VALID_BLOCK_H, bz, 0 : dim // 2]) + T.copy(sumexp, glse[bid, hid * VALID_BLOCK_H : (hid + 1) * VALID_BLOCK_H, bz]) elif tx >= 128 and tx < 256: T.set_max_nreg(168, 1) @@ -414,9 +417,7 @@ def flashattn(batch, heads, kv_head_num, seqlen_kv, dim, pe_dim, block_N, block_ acc_o_r[h_i, d_i] /= sum_exp_shared[h_i] T.copy(acc_o_r, O_shared_r) - T.copy( - O_shared_r, Output_partial[bid, hid * VALID_BLOCK_H:(hid + 1) * VALID_BLOCK_H, - bz, dim // 2:dim]) + T.copy(O_shared_r, Output_partial[bid, hid * VALID_BLOCK_H : (hid + 1) * VALID_BLOCK_H, bz, dim // 2 : dim]) elif tx >= 256: # producer @@ -425,54 +426,48 @@ def flashattn(batch, heads, kv_head_num, seqlen_kv, dim, pe_dim, block_N, block_ # Buffer 0 T.barrier_wait(bar_k_0_free[0], ((i_i & 1) ^ 1)) for r in T.serial(4): - kv_indices = (seqlen_kv // num_split) * bz + ( - i_i * 2) * block_N + r * 16 + (tx - 256) // 8 + kv_indices = (seqlen_kv // num_split) * bz + (i_i * 2) * block_N + r * 16 + (tx - 256) // 8 with T.attr("default", "async_scope", 1): for u in T.serial(4): for v in T.vectorized(8): - KV_shared_0_l[r * 16 + (tx - 256) // 8, - 64 * u + (tx - 256) % 8 * 8 + - v] = KV[bid, kv_indices, cur_kv_head, - 64 * u + (tx - 256) % 8 * 8 + v] - KV_shared_0_r[r * 16 + (tx - 256) // 8, - 64 * u + (tx - 256) % 8 * 8 + - v] = KV[bid, kv_indices, cur_kv_head, dim // 2 + - 64 * u + (tx - 256) % 8 * 8 + v] + KV_shared_0_l[r * 16 + (tx - 256) // 8, 64 * u + (tx - 256) % 8 * 8 + v] = KV[ + bid, kv_indices, cur_kv_head, 64 * u + (tx - 256) % 8 * 
8 + v + ] + KV_shared_0_r[r * 16 + (tx - 256) // 8, 64 * u + (tx - 256) % 8 * 8 + v] = KV[ + bid, kv_indices, cur_kv_head, dim // 2 + 64 * u + (tx - 256) % 8 * 8 + v + ] with T.attr("default", "async_scope", 1): for v in T.vectorized(8): - K_tail_shared_0[r * 16 + (tx - 256) // 8, (tx - 256) % 8 * 8 + - v] = K_pe[bid, kv_indices, cur_kv_head, - (tx - 256) % 8 * 8 + v] + K_tail_shared_0[r * 16 + (tx - 256) // 8, (tx - 256) % 8 * 8 + v] = K_pe[ + bid, kv_indices, cur_kv_head, (tx - 256) % 8 * 8 + v + ] T.cp_async_barrier_noinc(bar_k_0_ready[0]) # Buffer 1 T.barrier_wait(bar_k_1_free[0], ((i_i & 1) ^ 1)) for r in T.serial(4): - kv_indices = (seqlen_kv // num_split) * bz + ( - i_i * 2 + 1) * block_N + r * 16 + (tx - 256) // 8 + kv_indices = (seqlen_kv // num_split) * bz + (i_i * 2 + 1) * block_N + r * 16 + (tx - 256) // 8 with T.attr("default", "async_scope", 1): for u in T.serial(4): for v in T.vectorized(8): - KV_shared_1_l[r * 16 + (tx - 256) // 8, - 64 * u + (tx - 256) % 8 * 8 + - v] = KV[bid, kv_indices, cur_kv_head, - 64 * u + (tx - 256) % 8 * 8 + v] - KV_shared_1_r[r * 16 + (tx - 256) // 8, - 64 * u + (tx - 256) % 8 * 8 + - v] = KV[bid, kv_indices, cur_kv_head, dim // 2 + - 64 * u + (tx - 256) % 8 * 8 + v] + KV_shared_1_l[r * 16 + (tx - 256) // 8, 64 * u + (tx - 256) % 8 * 8 + v] = KV[ + bid, kv_indices, cur_kv_head, 64 * u + (tx - 256) % 8 * 8 + v + ] + KV_shared_1_r[r * 16 + (tx - 256) // 8, 64 * u + (tx - 256) % 8 * 8 + v] = KV[ + bid, kv_indices, cur_kv_head, dim // 2 + 64 * u + (tx - 256) % 8 * 8 + v + ] with T.attr("default", "async_scope", 1): for v in T.vectorized(8): - K_tail_shared_1[r * 16 + (tx - 256) // 8, (tx - 256) % 8 * 8 + - v] = K_pe[bid, kv_indices, cur_kv_head, - (tx - 256) % 8 * 8 + v] + K_tail_shared_1[r * 16 + (tx - 256) // 8, (tx - 256) % 8 * 8 + v] = K_pe[ + bid, kv_indices, cur_kv_head, (tx - 256) % 8 * 8 + v + ] T.cp_async_barrier_noinc(bar_k_1_ready[0]) @T.macro def combine( - glse: T.Tensor([batch, heads, num_split], dtype), - Output_partial: T.Tensor([batch, heads, num_split, dim], dtype), - Output: T.Tensor([batch, heads, dim], dtype), + glse: T.Tensor([batch, heads, num_split], dtype), + Output_partial: T.Tensor([batch, heads, num_split, dim], dtype), + Output: T.Tensor([batch, heads, dim], dtype), ): with T.Kernel(heads, batch, threads=128) as (hid, bz): po_local = T.alloc_fragment([dim], dtype) @@ -482,9 +477,11 @@ def flashattn(batch, heads, kv_head_num, seqlen_kv, dim, pe_dim, block_N, block_ lse_max_local = T.alloc_local([1], accum_dtype) scale_local = T.alloc_local([1], accum_dtype) - T.annotate_layout({ - lse_logsum_local: T.Fragment(lse_logsum_local.shape, forward_thread_fn=lambda i: i), - }) + T.annotate_layout( + { + lse_logsum_local: T.Fragment(lse_logsum_local.shape, forward_thread_fn=lambda i: i), + } + ) T.clear(lse_logsum_local) T.clear(o_accum_local) @@ -507,26 +504,26 @@ def flashattn(batch, heads, kv_head_num, seqlen_kv, dim, pe_dim, block_N, block_ @T.prim_func def main_split( - Q: T.Tensor([batch, heads, dim], dtype), - Q_pe: T.Tensor([batch, heads, pe_dim], dtype), - KV: T.Tensor([batch, seqlen_kv, kv_head_num, dim], dtype), - K_pe: T.Tensor([batch, seqlen_kv, kv_head_num, pe_dim], dtype), - glse: T.Tensor([batch, heads, num_split], dtype), - Output_partial: T.Tensor([batch, heads, num_split, dim], dtype), - Output: T.Tensor([batch, heads, dim], dtype), + Q: T.Tensor([batch, heads, dim], dtype), + Q_pe: T.Tensor([batch, heads, pe_dim], dtype), + KV: T.Tensor([batch, seqlen_kv, kv_head_num, dim], dtype), + K_pe: T.Tensor([batch, 
seqlen_kv, kv_head_num, pe_dim], dtype), + glse: T.Tensor([batch, heads, num_split], dtype), + Output_partial: T.Tensor([batch, heads, num_split, dim], dtype), + Output: T.Tensor([batch, heads, dim], dtype), ): flash_attn_split(Q, Q_pe, KV, K_pe, glse, Output_partial) combine(glse, Output_partial, Output) @T.prim_func def main_no_split( - Q: T.Tensor([batch, heads, dim], dtype), - Q_pe: T.Tensor([batch, heads, pe_dim], dtype), - KV: T.Tensor([batch, seqlen_kv, kv_head_num, dim], dtype), - K_pe: T.Tensor([batch, seqlen_kv, kv_head_num, pe_dim], dtype), - glse: T.Tensor([batch, heads, num_split], dtype), - Output_partial: T.Tensor([batch, heads, num_split, dim], dtype), - Output: T.Tensor([batch, heads, dim], dtype), + Q: T.Tensor([batch, heads, dim], dtype), + Q_pe: T.Tensor([batch, heads, pe_dim], dtype), + KV: T.Tensor([batch, seqlen_kv, kv_head_num, dim], dtype), + K_pe: T.Tensor([batch, seqlen_kv, kv_head_num, pe_dim], dtype), + glse: T.Tensor([batch, heads, num_split], dtype), + Output_partial: T.Tensor([batch, heads, num_split, dim], dtype), + Output: T.Tensor([batch, heads, dim], dtype), ): flash_attn(Q, Q_pe, KV, K_pe, Output) @@ -551,31 +548,24 @@ def ref_program(q, q_pe, kv, k_pe, glse, Output_partial): dim = q.shape[-1] pe_dim = q_pe.shape[-1] num_head_groups = q.shape[1] // kv.shape[2] - scale = (dim + pe_dim)**0.5 - q = rearrange( - q, 'b (h g) d -> b g h d', g=num_head_groups) # [batch_size, num_head_groups, groups, dim] + scale = (dim + pe_dim) ** 0.5 + q = rearrange(q, "b (h g) d -> b g h d", g=num_head_groups) # [batch_size, num_head_groups, groups, dim] - q_pe = rearrange( - q_pe, 'b (h g) d -> b g h d', - g=num_head_groups) # [batch_size, num_head_groups, groups, pe_dim] + q_pe = rearrange(q_pe, "b (h g) d -> b g h d", g=num_head_groups) # [batch_size, num_head_groups, groups, pe_dim] - kv = rearrange(kv, 'b n h d -> b h n d') # [batch_size, groups, seqlen_kv, dim] + kv = rearrange(kv, "b n h d -> b h n d") # [batch_size, groups, seqlen_kv, dim] - k_pe = rearrange(k_pe, 'b n h d -> b h n d') # [batch_size, num_head_groups, groups, pe_dim] + k_pe = rearrange(k_pe, "b n h d -> b h n d") # [batch_size, num_head_groups, groups, pe_dim] query = torch.concat([q, q_pe], dim=-1) key = torch.concat([kv, k_pe], dim=-1) - scores = einsum( - query, key, - 'b g h d, b h s d -> b g h s') # [batch_size, num_head_groups, groups, seqlen_kv] + scores = einsum(query, key, "b g h d, b h s d -> b g h s") # [batch_size, num_head_groups, groups, seqlen_kv] - attention = F.softmax( - scores / scale, dim=-1) # [batch_size, num_head_groups, groups, seqlen_kv] + attention = F.softmax(scores / scale, dim=-1) # [batch_size, num_head_groups, groups, seqlen_kv] - out = einsum(attention, kv, - 'b g h s, b h s d -> b g h d') # [batch_size, num_head_groups, groups, dim] - out = rearrange(out, 'b g h d -> b (h g) d') # [batch_size, heads, dim] + out = einsum(attention, kv, "b g h s, b h s d -> b g h d") # [batch_size, num_head_groups, groups, dim] + out = rearrange(out, "b g h d -> b (h g) d") # [batch_size, heads, dim] return out @@ -593,10 +583,9 @@ def main( BLOCK_N = 64 BLOCK_H = min(64, heads // kv_heads) num_split = 1 - softmax_scale = (dim + pe_dim)**-0.5 + softmax_scale = (dim + pe_dim) ** -0.5 - kernel = flashattn(batch, heads, kv_heads, kv_ctx, dim, pe_dim, BLOCK_N, BLOCK_H, num_split, - softmax_scale) + kernel = flashattn(batch, heads, kv_heads, kv_ctx, dim, pe_dim, BLOCK_N, BLOCK_H, num_split, softmax_scale) profiler = kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Randn) 
profiler.assert_allclose(ref_program, rtol=1e-4, atol=1e-4) latency = profiler.do_bench(warmup=500) @@ -606,12 +595,12 @@ def main( if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=132, help='batch size') - parser.add_argument('--heads', type=int, default=128, help='q heads number') - parser.add_argument('--kv_heads', type=int, default=1, help='kv heads number') - parser.add_argument('--kv_ctx', type=int, default=8192, help='kv context length') - parser.add_argument('--dim', type=int, default=512, help='head dim') - parser.add_argument('--pe_dim', type=int, default=64, help='pe head dim') + parser.add_argument("--batch", type=int, default=132, help="batch size") + parser.add_argument("--heads", type=int, default=128, help="q heads number") + parser.add_argument("--kv_heads", type=int, default=1, help="kv heads number") + parser.add_argument("--kv_ctx", type=int, default=8192, help="kv context length") + parser.add_argument("--dim", type=int, default=512, help="head dim") + parser.add_argument("--pe_dim", type=int, default=64, help="pe head dim") args = parser.parse_args() batch, heads, kv_heads, kv_ctx, dim, pe_dim = args.batch, args.heads, args.kv_heads, args.kv_ctx, args.dim, args.pe_dim main(batch, heads, kv_heads, kv_ctx, dim, pe_dim) diff --git a/examples/deepseek_mla/experimental/example_mla_decode_kv_fp8.py b/examples/deepseek_mla/experimental/example_mla_decode_kv_fp8.py index 1b1447e88f89acb6d6a29e930d76e84b13f1a664..fa39fa498f552c9409dfbd313a85a985e710c054 100644 --- a/examples/deepseek_mla/experimental/example_mla_decode_kv_fp8.py +++ b/examples/deepseek_mla/experimental/example_mla_decode_kv_fp8.py @@ -8,25 +8,27 @@ import argparse @tilelang.jit( - out_idx=[-1], pass_configs={ + out_idx=[-1], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def flashattn(batch, heads, kv_head_num, seqlen_kv, dim, pe_dim, block_N, block_H): - scale = (1.0 / (dim + pe_dim))**0.5 * 1.44269504 # log2(e) - dtype = "float16" - q_dtype = "float8_e4m3" - accum_dtype = "float" + scale = (1.0 / (dim + pe_dim)) ** 0.5 * 1.44269504 # log2(e) + dtype = T.float16 + q_dtype = T.float8_e4m3fn + accum_dtype = T.float32 kv_group_num = heads // kv_head_num VALID_BLOCK_H = min(block_H, kv_group_num) assert kv_head_num == 1, "kv_head_num must be 1" @T.prim_func def main_no_split( - Q: T.Tensor([batch, heads, dim], dtype), - Q_pe: T.Tensor([batch, heads, pe_dim], dtype), - KV: T.Tensor([batch, seqlen_kv, kv_head_num, dim], q_dtype), - K_pe: T.Tensor([batch, seqlen_kv, kv_head_num, pe_dim], dtype), - Output: T.Tensor([batch, heads, dim], dtype), + Q: T.Tensor([batch, heads, dim], dtype), + Q_pe: T.Tensor([batch, heads, pe_dim], dtype), + KV: T.Tensor([batch, seqlen_kv, kv_head_num, dim], q_dtype), + K_pe: T.Tensor([batch, seqlen_kv, kv_head_num, pe_dim], dtype), + Output: T.Tensor([batch, heads, dim], dtype), ): with T.Kernel(batch, heads // min(block_H, kv_group_num), threads=256) as (bx, by): Q_shared = T.alloc_shared([block_H, dim], dtype) @@ -46,34 +48,32 @@ def flashattn(batch, heads, kv_head_num, seqlen_kv, dim, pe_dim, block_N, block_ cur_kv_head = by // (kv_group_num // block_H) T.use_swizzle(10) - T.annotate_layout({ - O_shared: tilelang.layout.make_swizzled_layout(O_shared), - }) - - T.copy(Q[bx, by * VALID_BLOCK_H:(by + 1) * VALID_BLOCK_H, :], Q_shared) - T.copy(Q_pe[bx, by * VALID_BLOCK_H:(by + 1) * VALID_BLOCK_H, :], Q_pe_shared) + T.annotate_layout( + { + O_shared: 
tilelang.layout.make_swizzled_layout(O_shared), + } + ) + + T.copy(Q[bx, by * VALID_BLOCK_H : (by + 1) * VALID_BLOCK_H, :], Q_shared) + T.copy(Q_pe[bx, by * VALID_BLOCK_H : (by + 1) * VALID_BLOCK_H, :], Q_pe_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) T.disable_warp_group_reg_alloc() loop_range = T.ceildiv(seqlen_kv, block_N) for k in T.Pipelined(loop_range, num_stages=2): - T.copy(KV[bx, k * block_N:(k + 1) * block_N, cur_kv_head, :], qKV_shared) - T.copy(K_pe[bx, k * block_N:(k + 1) * block_N, cur_kv_head, :], K_pe_shared) + T.copy(KV[bx, k * block_N : (k + 1) * block_N, cur_kv_head, :], qKV_shared) + T.copy(K_pe[bx, k * block_N : (k + 1) * block_N, cur_kv_head, :], K_pe_shared) T.copy(qKV_shared, KV_shared) T.clear(acc_s) - T.gemm( - Q_shared, KV_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullCol) - T.gemm( - Q_pe_shared, - K_pe_shared, - acc_s, - transpose_B=True, - policy=T.GemmWarpPolicy.FullCol) + T.gemm(Q_shared, KV_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullCol) + T.gemm(Q_pe_shared, K_pe_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullCol) T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_H): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) for i in T.Parallel(block_H): scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_H, block_N): @@ -88,7 +88,7 @@ def flashattn(batch, heads, kv_head_num, seqlen_kv, dim, pe_dim, block_N, block_ for i, j in T.Parallel(block_H, dim): acc_o[i, j] /= logsum[i] T.copy(acc_o, O_shared) - T.copy(O_shared, Output[bx, by * VALID_BLOCK_H:(by + 1) * VALID_BLOCK_H, :]) + T.copy(O_shared, Output[bx, by * VALID_BLOCK_H : (by + 1) * VALID_BLOCK_H, :]) return main_no_split @@ -106,42 +106,35 @@ def ref_program(q, q_pe, kv, k_pe): dim = q.shape[-1] pe_dim = q_pe.shape[-1] num_head_groups = q.shape[1] // kv.shape[2] - scale = (dim + pe_dim)**0.5 - q = rearrange( - q, 'b (h g) d -> b g h d', g=num_head_groups) # [batch_size, num_head_groups, groups, dim] + scale = (dim + pe_dim) ** 0.5 + q = rearrange(q, "b (h g) d -> b g h d", g=num_head_groups) # [batch_size, num_head_groups, groups, dim] - q_pe = rearrange( - q_pe, 'b (h g) d -> b g h d', - g=num_head_groups) # [batch_size, num_head_groups, groups, pe_dim] + q_pe = rearrange(q_pe, "b (h g) d -> b g h d", g=num_head_groups) # [batch_size, num_head_groups, groups, pe_dim] - kv = rearrange(kv, 'b n h d -> b h n d') # [batch_size, groups, seqlen_kv, dim] + kv = rearrange(kv, "b n h d -> b h n d") # [batch_size, groups, seqlen_kv, dim] - k_pe = rearrange(k_pe, 'b n h d -> b h n d') # [batch_size, num_head_groups, groups, pe_dim] + k_pe = rearrange(k_pe, "b n h d -> b h n d") # [batch_size, num_head_groups, groups, pe_dim] query = torch.concat([q, q_pe], dim=-1) key = torch.concat([kv, k_pe], dim=-1) - scores = einsum( - query, key, - 'b g h d, b h s d -> b g h s') # [batch_size, num_head_groups, groups, seqlen_kv] + scores = einsum(query, key, "b g h d, b h s d -> b g h s") # [batch_size, num_head_groups, groups, seqlen_kv] - attention = F.softmax( - scores / scale, dim=-1) # [batch_size, num_head_groups, groups, seqlen_kv] + attention = F.softmax(scores / scale, dim=-1) # [batch_size, num_head_groups, groups, seqlen_kv] - out = einsum(attention, kv, - 'b g h s, b h s d -> b g h d') # [batch_size, num_head_groups, groups, dim] - out = rearrange(out, 'b g 
h d -> b (h g) d') # [batch_size, heads, dim] + out = einsum(attention, kv, "b g h s, b h s d -> b g h d") # [batch_size, num_head_groups, groups, dim] + out = rearrange(out, "b g h d -> b (h g) d") # [batch_size, heads, dim] return out if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=128, help='batch size') - parser.add_argument('--heads', type=int, default=128, help='q heads number') - parser.add_argument('--kv_heads', type=int, default=1, help='kv heads number') - parser.add_argument('--kv_ctx', type=int, default=8192, help='kv context length') - parser.add_argument('--dim', type=int, default=512, help='head dim') - parser.add_argument('--pe_dim', type=int, default=64, help='pe head dim') + parser.add_argument("--batch", type=int, default=128, help="batch size") + parser.add_argument("--heads", type=int, default=128, help="q heads number") + parser.add_argument("--kv_heads", type=int, default=1, help="kv heads number") + parser.add_argument("--kv_ctx", type=int, default=8192, help="kv context length") + parser.add_argument("--dim", type=int, default=512, help="head dim") + parser.add_argument("--pe_dim", type=int, default=64, help="pe head dim") args = parser.parse_args() batch, heads, kv_heads, kv_ctx, dim, pe_dim = args.batch, args.heads, args.kv_heads, args.kv_ctx, args.dim, args.pe_dim qk_flops = 2 * batch * heads * kv_ctx * (dim + pe_dim) diff --git a/examples/deepseek_mla/test_example_mla_decode.py b/examples/deepseek_mla/test_example_mla_decode.py index 66a750f7df2356ece5c2b8e842610699e892cd5c..a269ea57aed102b83596d4c7a896322fb105fbfb 100644 --- a/examples/deepseek_mla/test_example_mla_decode.py +++ b/examples/deepseek_mla/test_example_mla_decode.py @@ -1,5 +1,4 @@ import tilelang.testing - import example_mla_decode diff --git a/examples/deepseek_mla/torch_refs.py b/examples/deepseek_mla/torch_refs.py index 4b4c888cd2ac2db732948940599a845d5a663f20..aae6c7cd2b619afee90f39058cfd9a4a6a71e49e 100644 --- a/examples/deepseek_mla/torch_refs.py +++ b/examples/deepseek_mla/torch_refs.py @@ -11,7 +11,7 @@ def flash_split_ref(Q, Q_pe, KV, K_pe): block_N = 64 seqlen_kv = KV.size(1) - scale = (1.0 / (dim + pe_dim))**0.5 * 1.44269504 # log2(e) + scale = (1.0 / (dim + pe_dim)) ** 0.5 * 1.44269504 # log2(e) acc_s = torch.empty((batch, nheads, block_N), device="cuda", dtype=torch.float) acc_s_cast = torch.empty((batch, nheads, block_N), device="cuda", dtype=torch.float16) acc_o = torch.empty((batch, nheads, dim), device="cuda", dtype=torch.float) @@ -31,18 +31,20 @@ def flash_split_ref(Q, Q_pe, KV, K_pe): for ks in range(num_split): acc_o.fill_(0) logsum.fill_(0) - scores_max.fill_(float('-inf')) - scores_max_prev.fill_(float('-inf')) + scores_max.fill_(float("-inf")) + scores_max_prev.fill_(float("-inf")) for i in range(int((seqlen_kv // num_split) / block_N)): acc_s.fill_(0) - acc_s = torch.einsum('bhd,bkhd->bhk', Q_, - KV_[:, (seqlen_kv // num_split) * ks + - i * block_N:(seqlen_kv // num_split) * ks + - (i + 1) * block_N, :, :]) # [batch, nheads, block_N] + acc_s = torch.einsum( + "bhd,bkhd->bhk", + Q_, + KV_[:, (seqlen_kv // num_split) * ks + i * block_N : (seqlen_kv // num_split) * ks + (i + 1) * block_N, :, :], + ) # [batch, nheads, block_N] acc_s += torch.einsum( - 'bhd,bkhd->bhk', Q_pe_, - K_pe_[:, (seqlen_kv // num_split) * ks + i * block_N:(seqlen_kv // num_split) * ks + - (i + 1) * block_N, :, :]) + "bhd,bkhd->bhk", + Q_pe_, + K_pe_[:, (seqlen_kv // num_split) * ks + i * block_N : (seqlen_kv // num_split) * ks + (i + 1) * 
block_N, :, :], + ) scores_max_prev = scores_max scores_max = acc_s.max(dim=-1, keepdim=False).values # [batch, nheads] scores_scale = torch.exp2(scores_max_prev - scores_max) # [batch, nheads] @@ -50,9 +52,10 @@ def flash_split_ref(Q, Q_pe, KV, K_pe): acc_s = torch.exp2(acc_s - scores_max[:, :, None]) acc_s_cast = acc_s.to(torch.float16) # [batch, nheads, block_N] acc_o += torch.einsum( - 'bhk,bkhd->bhd', acc_s_cast, - KV_[:, (seqlen_kv // num_split) * ks + i * block_N:(seqlen_kv // num_split) * ks + - (i + 1) * block_N, :, :]) + "bhk,bkhd->bhd", + acc_s_cast, + KV_[:, (seqlen_kv // num_split) * ks + i * block_N : (seqlen_kv // num_split) * ks + (i + 1) * block_N, :, :], + ) scores_sum = acc_s.sum(dim=-1, keepdim=False) logsum = logsum * scores_scale + scores_sum acc_o /= logsum[:, :, None] diff --git a/examples/deepseek_nsa/benchmark/benchmark_nsa_fwd.py b/examples/deepseek_nsa/benchmark/benchmark_nsa_fwd.py index daee39865ccbc9776be153632d417e26fee366d4..dadb4b4cb916ebc6f8ebce9120071be10a55e5e4 100644 --- a/examples/deepseek_nsa/benchmark/benchmark_nsa_fwd.py +++ b/examples/deepseek_nsa/benchmark/benchmark_nsa_fwd.py @@ -14,21 +14,44 @@ from fla.ops.utils import prepare_token_indices from fla.utils import autocast_custom_fwd, contiguous -@triton.heuristics({ - 'USE_OFFSETS': lambda args: args['offsets'] is not None, - 'USE_BLOCK_COUNTS': lambda args: isinstance(args['block_counts'], torch.Tensor), -}) +@triton.heuristics( + { + "USE_OFFSETS": lambda args: args["offsets"] is not None, + "USE_BLOCK_COUNTS": lambda args: isinstance(args["block_counts"], torch.Tensor), + } +) @triton.autotune( configs=[triton.Config({}, num_warps=num_warps) for num_warps in [1]], - key=['BS', 'BK', 'BV'], + key=["BS", "BK", "BV"], ) @triton.jit -def parallel_nsa_fwd_kernel(q, k, v, o_slc, o_swa, lse_slc, lse_swa, scale, block_indices, - block_counts, offsets, token_indices, T, H: tl.constexpr, - HQ: tl.constexpr, G: tl.constexpr, K: tl.constexpr, V: tl.constexpr, - S: tl.constexpr, BS: tl.constexpr, WS: tl.constexpr, BK: tl.constexpr, - BV: tl.constexpr, USE_OFFSETS: tl.constexpr, - USE_BLOCK_COUNTS: tl.constexpr): +def parallel_nsa_fwd_kernel( + q, + k, + v, + o_slc, + o_swa, + lse_slc, + lse_swa, + scale, + block_indices, + block_counts, + offsets, + token_indices, + T, + H: tl.constexpr, + HQ: tl.constexpr, + G: tl.constexpr, + K: tl.constexpr, + V: tl.constexpr, + S: tl.constexpr, + BS: tl.constexpr, + WS: tl.constexpr, + BK: tl.constexpr, + BV: tl.constexpr, + USE_OFFSETS: tl.constexpr, + USE_BLOCK_COUNTS: tl.constexpr, +): i_t, i_v, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2) i_b, i_h = i_bh // H, i_bh % H @@ -40,20 +63,18 @@ def parallel_nsa_fwd_kernel(q, k, v, o_slc, o_swa, lse_slc, lse_swa, scale, bloc NS = S - p_q = tl.make_block_ptr(q + (bos + i_t) * HQ * K, (HQ, K), (K, 1), (i_h * G, 0), (G, BK), - (1, 0)) + p_q = tl.make_block_ptr(q + (bos + i_t) * HQ * K, (HQ, K), (K, 1), (i_h * G, 0), (G, BK), (1, 0)) # the Q block is kept in the shared memory throughout the whole kernel # [G, BK] b_q = tl.load(p_q, boundary_check=(0, 1)) b_q = (b_q * scale).to(b_q.dtype) - p_o_slc = tl.make_block_ptr(o_slc + (bos + i_t) * HQ * V, (HQ, V), (V, 1), (i_h * G, i_v * BV), - (G, BV), (1, 0)) + p_o_slc = tl.make_block_ptr(o_slc + (bos + i_t) * HQ * V, (HQ, V), (V, 1), (i_h * G, i_v * BV), (G, BV), (1, 0)) p_lse_slc = lse_slc + (bos + i_t) * HQ + i_h * G + tl.arange(0, G) # [G, BV] b_o_slc = tl.zeros([G, BV], dtype=tl.float32) - b_m_slc = tl.full([G], float('-inf'), dtype=tl.float32) + b_m_slc = 
tl.full([G], float("-inf"), dtype=tl.float32) b_acc_slc = tl.zeros([G], dtype=tl.float32) for i in range(NS): i_s = tl.load(block_indices + i).to(tl.int32) * BS @@ -66,7 +87,7 @@ def parallel_nsa_fwd_kernel(q, k, v, o_slc, o_swa, lse_slc, lse_swa, scale, bloc b_v_slc = tl.load(p_v_slc, boundary_check=(0, 1)) # [G, BS] b_s_slc = tl.dot(b_q, b_k_slc) - b_s_slc = tl.where((i_t >= (i_s + tl.arange(0, BS)))[None, :], b_s_slc, float('-inf')) + b_s_slc = tl.where((i_t >= (i_s + tl.arange(0, BS)))[None, :], b_s_slc, float("-inf")) # [G] b_m_slc, b_mp_slc = tl.maximum(b_m_slc, tl.max(b_s_slc, 1)), b_m_slc @@ -87,7 +108,6 @@ def parallel_nsa_fwd_kernel(q, k, v, o_slc, o_swa, lse_slc, lse_swa, scale, bloc class ParallelNSAFunction(torch.autograd.Function): - @staticmethod @contiguous @autocast_custom_fwd @@ -100,8 +120,7 @@ class ParallelNSAFunction(torch.autograd.Function): # [[0, 0], [0, 1], [1, 0], [1, 1], [1, 2], [1, 3]] token_indices = prepare_token_indices(offsets) if offsets is not None else None - o, lse = parallel_nsa_fwd( - q=q, k=k, v=v, block_indices=block_indices, block_size=block_size, scale=scale) + o, lse = parallel_nsa_fwd(q=q, k=k, v=v, block_indices=block_indices, block_size=block_size, scale=scale) ctx.save_for_backward(q, k, v, o, lse) ctx.block_indices = block_indices ctx.block_size = block_size @@ -172,7 +191,6 @@ def parallel_nsa_fwd( @torch.compile class ParallelNSAFunction(torch.autograd.Function): - @staticmethod @contiguous @autocast_custom_fwd @@ -195,7 +213,8 @@ class ParallelNSAFunction(torch.autograd.Function): window_size=window_size, scale=scale, offsets=offsets, - token_indices=token_indices) + token_indices=token_indices, + ) ctx.save_for_backward(q, k, v, o_slc, lse_slc, o_swa, lse_swa) ctx.block_indices = block_indices ctx.block_counts = block_counts @@ -207,18 +226,20 @@ class ParallelNSAFunction(torch.autograd.Function): return o_slc.to(q.dtype), o_swa.to(q.dtype) if o_swa is not None else o_swa -def parallel_nsa(q: torch.Tensor, - k: torch.Tensor, - v: torch.Tensor, - g_slc: torch.Tensor, - g_swa: torch.Tensor, - block_indices: torch.LongTensor, - block_counts: Optional[Union[torch.LongTensor, int]] = None, - block_size: int = 64, - window_size: int = 0, - scale: Optional[float] = None, - cu_seqlens: Optional[torch.LongTensor] = None, - head_first: bool = False) -> torch.Tensor: +def parallel_nsa( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + g_slc: torch.Tensor, + g_swa: torch.Tensor, + block_indices: torch.LongTensor, + block_counts: Optional[Union[torch.LongTensor, int]] = None, + block_size: int = 64, + window_size: int = 0, + scale: Optional[float] = None, + cu_seqlens: Optional[torch.LongTensor] = None, + head_first: bool = False, +) -> torch.Tensor: r""" Args: q (torch.Tensor): @@ -258,44 +279,44 @@ def parallel_nsa(q: torch.Tensor, Outputs of shape `[B, T, HQ, V]` if `head_first=False` else `[B, HQ, T, V]`. 
""" if scale is None: - scale = k.shape[-1]**-0.5 + scale = k.shape[-1] ** -0.5 if cu_seqlens is not None: assert q.shape[0] == 1, "batch size must be 1 when cu_seqlens are provided" if head_first: - q, k, v, block_indices = map(lambda x: rearrange(x, 'b h t d -> b t h d'), - (q, k, v, block_indices)) - g_slc, g_swa = map(lambda x: rearrange(x, 'b h t -> b t h'), (g_slc, g_swa)) + q, k, v, block_indices = map(lambda x: rearrange(x, "b h t d -> b t h d"), (q, k, v, block_indices)) + g_slc, g_swa = map(lambda x: rearrange(x, "b h t -> b t h"), (g_slc, g_swa)) if isinstance(block_counts, torch.Tensor): - block_counts = rearrange(block_counts, 'b h t -> b t h') + block_counts = rearrange(block_counts, "b h t -> b t h") assert q.shape[2] % (k.shape[2] * 16) == 0, "Group size must be a multiple of 16 in NSA" if isinstance(block_counts, int): block_indices = block_indices[:, :, :, :block_counts] block_counts = None - o_slc, o_swa = ParallelNSAFunction.apply(q, k, v, block_indices, block_counts, block_size, - window_size, scale, cu_seqlens) + o_slc, o_swa = ParallelNSAFunction.apply(q, k, v, block_indices, block_counts, block_size, window_size, scale, cu_seqlens) if window_size > 0: o = torch.addcmul(o_slc * g_slc.unsqueeze(-1), o_swa, g_swa.unsqueeze(-1)) else: o = o_slc * g_slc.unsqueeze(-1) if head_first: - o = rearrange(o, 'b t h d -> b h t d') + o = rearrange(o, "b t h d -> b h t d") return o -def naive_nsa(q: torch.Tensor, - k: torch.Tensor, - v: torch.Tensor, - g_slc: torch.Tensor, - g_swa: torch.Tensor, - block_indices: torch.LongTensor, - block_counts: Optional[Union[torch.LongTensor, int]] = None, - block_size: int = 64, - window_size: int = 0, - scale: Optional[float] = None, - cu_seqlens: Optional[torch.LongTensor] = None, - head_first: bool = False) -> torch.Tensor: +def naive_nsa( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + g_slc: torch.Tensor, + g_swa: torch.Tensor, + block_indices: torch.LongTensor, + block_counts: Optional[Union[torch.LongTensor, int]] = None, + block_size: int = 64, + window_size: int = 0, + scale: Optional[float] = None, + cu_seqlens: Optional[torch.LongTensor] = None, + head_first: bool = False, +) -> torch.Tensor: r""" Args: q (torch.Tensor): @@ -335,26 +356,24 @@ def naive_nsa(q: torch.Tensor, Outputs of shape `[B, T, HQ, V]` if `head_first=False` else `[B, HQ, T, V]`. 
""" if scale is None: - scale = k.shape[-1]**-0.5 + scale = k.shape[-1] ** -0.5 if cu_seqlens is not None: assert q.shape[0] == 1, "batch size must be 1 when cu_seqlens are provided" if head_first: - raise RuntimeError( - "Sequences with variable lengths are not supported for head-first mode") + raise RuntimeError("Sequences with variable lengths are not supported for head-first mode") if head_first: - q, k, v, block_indices = map(lambda x: rearrange(x, 'b h t d -> b t h d'), - (q, k, v, block_indices)) - g_slc, g_swa = map(lambda x: rearrange(x, 'b h t -> b t h'), (g_slc, g_swa)) + q, k, v, block_indices = map(lambda x: rearrange(x, "b h t d -> b t h d"), (q, k, v, block_indices)) + g_slc, g_swa = map(lambda x: rearrange(x, "b h t -> b t h"), (g_slc, g_swa)) if isinstance(block_counts, torch.Tensor): - block_counts = rearrange(block_counts, 'b h t -> b t h') + block_counts = rearrange(block_counts, "b h t -> b t h") dtype = q.dtype G = q.shape[2] // k.shape[2] BS = block_size S = block_indices.shape[-1] - k, v, block_indices = (repeat(x, 'b t h d -> b t (h g) d', g=G) for x in (k, v, block_indices)) + k, v, block_indices = (repeat(x, "b t h d -> b t (h g) d", g=G) for x in (k, v, block_indices)) if isinstance(block_counts, torch.Tensor): - block_counts = repeat(block_counts, 'b t h -> b t (h g)', g=G) + block_counts = repeat(block_counts, "b t h -> b t (h g)", g=G) c = torch.arange(S).repeat_interleave(BS).unsqueeze(1).expand(-1, q.shape[2]).to(q.device) q, k, v = map(lambda x: x.float(), (q, k, v)) @@ -364,14 +383,11 @@ def naive_nsa(q: torch.Tensor, if cu_seqlens is None: varlen = False B, T = q.shape[:2] - cu_seqlens = torch.cat( - [block_indices.new_tensor(range(0, B * T, T)), - block_indices.new_tensor([B * T])]) + cu_seqlens = torch.cat([block_indices.new_tensor(range(0, B * T, T)), block_indices.new_tensor([B * T])]) for i in range(len(cu_seqlens) - 1): if not varlen: - q_b, k_b, v_b, g_slc_b, g_swa_b, i_b = q[i], k[i], v[i], g_slc[i], g_swa[ - i], block_indices[i] + q_b, k_b, v_b, g_slc_b, g_swa_b, i_b = q[i], k[i], v[i], g_slc[i], g_swa[i], block_indices[i] if isinstance(block_counts, torch.Tensor): s_b = block_counts[i] else: @@ -379,10 +395,10 @@ def naive_nsa(q: torch.Tensor, else: T = cu_seqlens[i + 1] - cu_seqlens[i] q_b, k_b, v_b, g_slc_b, g_swa_b, i_b = map( - lambda x: x[0][cu_seqlens[i]:cu_seqlens[i + 1]], - (q, k, v, g_slc, g_swa, block_indices)) + lambda x: x[0][cu_seqlens[i] : cu_seqlens[i + 1]], (q, k, v, g_slc, g_swa, block_indices) + ) if isinstance(block_counts, torch.Tensor): - s_b = block_counts[0][cu_seqlens[i]:cu_seqlens[i + 1]] + s_b = block_counts[0][cu_seqlens[i] : cu_seqlens[i + 1]] else: s_b = block_counts @@ -404,71 +420,58 @@ def naive_nsa(q: torch.Tensor, else: s_i = s_b # [S*BS, HQ, -1] - k_i_slc, v_i_slc = map( - lambda x: x.gather( - 0, - i_i.clamp(0, T - 1).unsqueeze(-1).expand(*i_i.shape, x.shape[-1])), (k_b, v_b)) + k_i_slc, v_i_slc = map(lambda x: x.gather(0, i_i.clamp(0, T - 1).unsqueeze(-1).expand(*i_i.shape, x.shape[-1])), (k_b, v_b)) # [S*BS, HQ] - attn_slc = torch.einsum('h d, n h d -> n h', q_i, k_i_slc).masked_fill( - torch.logical_or(i_i < 0, i_i > i_q) | - (c >= s_i if block_counts is not None else False), float('-inf')).softmax(0) + attn_slc = ( + torch.einsum("h d, n h d -> n h", q_i, k_i_slc) + .masked_fill(torch.logical_or(i_i < 0, i_i > i_q) | (c >= s_i if block_counts is not None else False), float("-inf")) + .softmax(0) + ) if not varlen: - o_slc[i, i_q] = torch.einsum('n h, n h v -> h v', attn_slc, - v_i_slc) * 
g_slc_i.unsqueeze(-1) + o_slc[i, i_q] = torch.einsum("n h, n h v -> h v", attn_slc, v_i_slc) * g_slc_i.unsqueeze(-1) else: - o_slc[0][cu_seqlens[i] + i_q] = torch.einsum('n h, n h v -> h v', attn_slc, - v_i_slc) * g_slc_i.unsqueeze(-1) + o_slc[0][cu_seqlens[i] + i_q] = torch.einsum("n h, n h v -> h v", attn_slc, v_i_slc) * g_slc_i.unsqueeze(-1) if window_size > 0: - k_i_swa, v_i_swa = map(lambda x: x[max(0, i_q - window_size + 1):i_q + 1], - (k_b, v_b)) - attn_swa = torch.einsum('h d, n h d -> n h', q_i, k_i_swa).softmax(0) + k_i_swa, v_i_swa = map(lambda x: x[max(0, i_q - window_size + 1) : i_q + 1], (k_b, v_b)) + attn_swa = torch.einsum("h d, n h d -> n h", q_i, k_i_swa).softmax(0) if not varlen: - o_swa[i, i_q] = torch.einsum('n h, n h v -> h v', attn_swa, - v_i_swa) * g_swa_i.unsqueeze(-1) + o_swa[i, i_q] = torch.einsum("n h, n h v -> h v", attn_swa, v_i_swa) * g_swa_i.unsqueeze(-1) else: - o_swa[0][cu_seqlens[i] + i_q] = torch.einsum('n h, n h v -> h v', attn_swa, - v_i_swa) * g_swa_i.unsqueeze(-1) + o_swa[0][cu_seqlens[i] + i_q] = torch.einsum("n h, n h v -> h v", attn_swa, v_i_swa) * g_swa_i.unsqueeze(-1) if head_first: - o_slc = rearrange(o_slc, 'b t h d -> b h t d') - o_swa = rearrange(o_swa, 'b t h d -> b h t d') + o_slc = rearrange(o_slc, "b t h d -> b h t d") + o_swa = rearrange(o_swa, "b t h d -> b h t d") return o_slc.to(dtype) + o_swa.to(dtype) if o_swa is not None else o_slc.to(dtype) def get_configs(): import itertools + iter_params = dict( block_T=[128, 256, 512], num_stages=[0, 1, 2, 4, 5], threads=[32, 64, 128, 256, 512], ) - return [{ - k: v for k, v in zip(iter_params, values) - } for values in itertools.product(*iter_params.values())] + return [{k: v for k, v in zip(iter_params, values)} for values in itertools.product(*iter_params.values())] -@tilelang.autotune(configs=get_configs(),) +@tilelang.autotune( + configs=get_configs(), +) @tilelang.jit( pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, - }) -def tilelang_sparse_attention(batch, - heads, - seq_len, - dim, - is_causal, - scale=None, - block_size=64, - groups=1, - selected_blocks=16, - block_T=128, - num_stages=2, - threads=32): + } +) +def tilelang_sparse_attention( + batch, heads, seq_len, dim, is_causal, scale=None, block_size=64, groups=1, selected_blocks=16, block_T=128, num_stages=2, threads=32 +): if scale is None: - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) else: scale = scale * 1.44269504 # log2(e) @@ -476,9 +479,9 @@ def tilelang_sparse_attention(batch, q_shape = [batch, seq_len, heads, dim] kv_shape = [batch, seq_len, head_kv, dim] block_indices_shape = [batch, seq_len, head_kv, selected_blocks] - block_indices_dtype = "int32" - dtype = "float16" - accum_dtype = "float" + block_indices_dtype = T.int32 + dtype = T.float16 + accum_dtype = T.float32 block_S = block_size block_T = min(block_T, tilelang.math.next_power_of_2(dim)) @@ -493,11 +496,11 @@ def tilelang_sparse_attention(batch, @T.prim_func def tilelang_sparse_attention( - Q: T.Tensor(q_shape, dtype), - K: T.Tensor(kv_shape, dtype), - V: T.Tensor(kv_shape, dtype), - BlockIndices: T.Tensor(block_indices_shape, block_indices_dtype), - Output: T.Tensor(q_shape, dtype), + Q: T.Tensor(q_shape, dtype), + K: T.Tensor(kv_shape, dtype), + V: T.Tensor(kv_shape, dtype), + BlockIndices: T.Tensor(block_indices_shape, block_indices_dtype), + Output: T.Tensor(q_shape, dtype), ): 
with T.Kernel(seq_len, NV, batch * head_kv, threads=threads) as (bx, by, bz): Q_shared = T.alloc_shared([G, BK], dtype) @@ -520,7 +523,7 @@ def tilelang_sparse_attention(batch, i_b, i_h = i_bh // head_kv, i_bh % head_kv NS = S - T.copy(Q[i_b, i_t, i_h * G:(i_h + 1) * G, :], Q_shared) + T.copy(Q[i_b, i_t, i_h * G : (i_h + 1) * G, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) @@ -530,21 +533,15 @@ def tilelang_sparse_attention(batch, i_s = BlockIndices[i_b, i_t, i_h, i] * BS if i_s <= i_t and i_s >= 0: # [BS, BK] - T.copy(K[i_b, i_s:i_s + BS, i_h, :], K_shared) + T.copy(K[i_b, i_s : i_s + BS, i_h, :], K_shared) if is_causal: for i, j in T.Parallel(G, BS): - acc_s[i, j] = T.if_then_else(i_t >= (i_s + j), 0, - -T.infinity(acc_s.dtype)) + acc_s[i, j] = T.if_then_else(i_t >= (i_s + j), 0, -T.infinity(acc_s.dtype)) else: T.clear(acc_s) - T.gemm( - Q_shared, - K_shared, - acc_s, - transpose_B=True, - policy=T.GemmWarpPolicy.FullRow) + T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) # Softmax T.copy(scores_max, scores_max_prev) @@ -564,45 +561,33 @@ def tilelang_sparse_attention(batch, acc_o[i, j] *= scores_scale[i] # V * softmax(Q * K) - T.copy(V[i_b, i_s:i_s + BS, i_h, i_v * BV:(i_v + 1) * BV], V_shared) + T.copy(V[i_b, i_s : i_s + BS, i_h, i_v * BV : (i_v + 1) * BV], V_shared) T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) for i, j in T.Parallel(G, BV): acc_o[i, j] /= logsum[i] T.copy(acc_o, O_shared) - T.copy(O_shared, Output[i_b, i_t, i_h * G:(i_h + 1) * G, i_v * BV:(i_v + 1) * BV]) + T.copy(O_shared, Output[i_b, i_t, i_h * G : (i_h + 1) * G, i_v * BV : (i_v + 1) * BV]) return tilelang_sparse_attention def generate_block_indices(batch, seq_len, heads, selected_blocks, block_size): """Generate random block indices for the benchmark.""" - block_indices = torch.full((batch, seq_len, heads, selected_blocks), - seq_len, - dtype=torch.long, - device='cuda') + block_indices = torch.full((batch, seq_len, heads, selected_blocks), seq_len, dtype=torch.long, device="cuda") for b in range(batch): for t in range(seq_len): for h in range(heads): i_i = torch.randperm(max(1, (t // block_size)))[:selected_blocks] - block_indices[b, t, h, :len(i_i)] = i_i + block_indices[b, t, h, : len(i_i)] = i_i return block_indices.sort(-1)[0] -def benchmark_nsa(batch_size, - seq_len, - heads, - head_query, - dim, - selected_blocks, - block_size, - dtype, - scale, - warmup=10, - iterations=100, - validate=False): +def benchmark_nsa( + batch_size, seq_len, heads, head_query, dim, selected_blocks, block_size, dtype, scale, warmup=10, iterations=100, validate=False +): """Benchmark the TileLang Sparse Attention implementation.""" # Set random seed for reproducibility @@ -628,14 +613,13 @@ def benchmark_nsa(batch_size, print(f"Profiler latency: {profiler_latency} ms") # Create input tensors - Q = torch.randn((batch_size, seq_len, head_query, dim), dtype=dtype, device='cuda') - K = torch.randn((batch_size, seq_len, heads, dim), dtype=dtype, device='cuda') - V = torch.randn((batch_size, seq_len, heads, dim), dtype=dtype, device='cuda') - out = torch.empty((batch_size, seq_len, head_query, dim), dtype=dtype, device='cuda') + Q = torch.randn((batch_size, seq_len, head_query, dim), dtype=dtype, device="cuda") + K = torch.randn((batch_size, seq_len, heads, dim), dtype=dtype, device="cuda") + V = torch.randn((batch_size, seq_len, heads, dim), dtype=dtype, device="cuda") + out = torch.empty((batch_size, seq_len, head_query, dim), dtype=dtype, device="cuda") # Generate block 
indices - block_indices = generate_block_indices(batch_size, seq_len, heads, selected_blocks, - block_size).to(torch.int32) + block_indices = generate_block_indices(batch_size, seq_len, heads, selected_blocks, block_size).to(torch.int32) # Warmup for _ in range(warmup): @@ -666,10 +650,9 @@ def benchmark_nsa(batch_size, # Validate result against reference if requested if validate: - g_slc = torch.ones((batch_size, seq_len, head_query), dtype=dtype, device='cuda') - g_swa = torch.ones((batch_size, seq_len, head_query), dtype=dtype, device='cuda') - block_counts = torch.randint( - 1, selected_blocks + 1, (batch_size, seq_len, heads), device='cuda') + g_slc = torch.ones((batch_size, seq_len, head_query), dtype=dtype, device="cuda") + g_swa = torch.ones((batch_size, seq_len, head_query), dtype=dtype, device="cuda") + block_counts = torch.randint(1, selected_blocks + 1, (batch_size, seq_len, heads), device="cuda") ref = naive_nsa( q=Q, @@ -700,22 +683,13 @@ def benchmark_nsa(batch_size, "head_query": head_query, "dim": dim, "selected_blocks": selected_blocks, - "block_size": block_size + "block_size": block_size, } -def benchmark_triton_nsa(batch_size, - seq_len, - heads, - head_query, - dim, - selected_blocks, - block_size, - dtype, - scale, - warmup=10, - iterations=100, - validate=False): +def benchmark_triton_nsa( + batch_size, seq_len, heads, head_query, dim, selected_blocks, block_size, dtype, scale, warmup=10, iterations=100, validate=False +): """Benchmark the Triton-based TileLang Sparse Attention implementation.""" # Set random seed for reproducibility @@ -723,18 +697,17 @@ def benchmark_triton_nsa(batch_size, torch.random.manual_seed(0) # Create input tensors - Q = torch.randn((batch_size, seq_len, head_query, dim), dtype=dtype, device='cuda') - K = torch.randn((batch_size, seq_len, heads, dim), dtype=dtype, device='cuda') - V = torch.randn((batch_size, seq_len, heads, dim), dtype=dtype, device='cuda') - g_slc = torch.ones((batch_size, seq_len, head_query), dtype=dtype, device='cuda') - g_swa = torch.ones((batch_size, seq_len, head_query), dtype=dtype, device='cuda') + Q = torch.randn((batch_size, seq_len, head_query, dim), dtype=dtype, device="cuda") + K = torch.randn((batch_size, seq_len, heads, dim), dtype=dtype, device="cuda") + V = torch.randn((batch_size, seq_len, heads, dim), dtype=dtype, device="cuda") + g_slc = torch.ones((batch_size, seq_len, head_query), dtype=dtype, device="cuda") + g_swa = torch.ones((batch_size, seq_len, head_query), dtype=dtype, device="cuda") # Generate block indices block_indices = generate_block_indices(batch_size, seq_len, heads, selected_blocks, block_size) - block_counts = torch.randint( - 1, selected_blocks + 1, (batch_size, seq_len, heads), device='cuda') - o_slc = torch.empty((batch_size, seq_len, head_query, dim), dtype=dtype, device='cuda') - lse_slc = torch.empty((batch_size, seq_len, head_query), dtype=torch.float, device='cuda') + block_counts = torch.randint(1, selected_blocks + 1, (batch_size, seq_len, heads), device="cuda") + o_slc = torch.empty((batch_size, seq_len, head_query, dim), dtype=dtype, device="cuda") + lse_slc = torch.empty((batch_size, seq_len, head_query), dtype=torch.float, device="cuda") # Warmup for _ in range(warmup): @@ -750,7 +723,8 @@ def benchmark_triton_nsa(batch_size, block_counts=block_counts, block_size=block_size, window_size=0, - scale=scale) + scale=scale, + ) # Synchronize before timing torch.cuda.synchronize() @@ -770,7 +744,8 @@ def benchmark_triton_nsa(batch_size, block_counts=block_counts, 
block_size=block_size, window_size=0, - scale=scale) + scale=scale, + ) torch.cuda.synchronize() end_time = time.time() @@ -815,54 +790,28 @@ def benchmark_triton_nsa(batch_size, "head_query": head_query, "dim": dim, "selected_blocks": selected_blocks, - "block_size": block_size + "block_size": block_size, } -def run_benchmark_suite(impl='all'): +def run_benchmark_suite(impl="all"): """Run a suite of benchmarks with different configurations.""" # Define configurations to benchmark configs = [ # Small model config - Note: head_query must be a multiple of heads*16 for Triton - { - "batch_size": 2, - "seq_len": 1024, - "heads": 8, - "head_query": 8 * 16, - "dim": 64, - "selected_blocks": 8, - "block_size": 32 - }, - + {"batch_size": 2, "seq_len": 1024, "heads": 8, "head_query": 8 * 16, "dim": 64, "selected_blocks": 8, "block_size": 32}, # Medium model config - { - "batch_size": 2, - "seq_len": 2048, - "heads": 16, - "head_query": 16 * 16, - "dim": 64, - "selected_blocks": 16, - "block_size": 64 - }, - + {"batch_size": 2, "seq_len": 2048, "heads": 16, "head_query": 16 * 16, "dim": 64, "selected_blocks": 16, "block_size": 64}, # Large model config - { - "batch_size": 1, - "seq_len": 4096, - "heads": 32, - "head_query": 32 * 16, - "dim": 128, - "selected_blocks": 32, - "block_size": 128 - }, + {"batch_size": 1, "seq_len": 4096, "heads": 32, "head_query": 32 * 16, "dim": 128, "selected_blocks": 32, "block_size": 128}, ] results = [] for config in configs: print(f"Running benchmark with config: {config}") - if impl in ['all', 'tilelang']: + if impl in ["all", "tilelang"]: print("Benchmarking TileLang implementation:") result = benchmark_nsa( batch_size=config["batch_size"], @@ -874,12 +823,13 @@ def run_benchmark_suite(impl='all'): block_size=config["block_size"], dtype=torch.float16, scale=0.1, - validate=False) + validate=False, + ) results.append({"impl": "tilelang", **result}) print(f"Average time: {result['avg_time_ms']:.2f} ms") print(f"Performance: {result['tflops']:.2f} TFLOPs") - if impl in ['all', 'triton']: + if impl in ["all", "triton"]: print("Benchmarking Triton implementation:") result = benchmark_triton_nsa( batch_size=config["batch_size"], @@ -891,19 +841,24 @@ def run_benchmark_suite(impl='all'): block_size=config["block_size"], dtype=torch.float16, scale=0.1, - validate=False) + validate=False, + ) results.append({"impl": "triton", **result}) print(f"Average time: {result['avg_time_ms']:.2f} ms") print(f"Performance: {result['tflops']:.2f} TFLOPs") - if impl in ['all']: + if impl in ["all"]: # Print comparison if both implementations were run tilelang_result = next( - r for r in results if r["impl"] == "tilelang" and - r["batch_size"] == config["batch_size"] and r["seq_len"] == config["seq_len"]) + r + for r in results + if r["impl"] == "tilelang" and r["batch_size"] == config["batch_size"] and r["seq_len"] == config["seq_len"] + ) triton_result = next( - r for r in results if r["impl"] == "triton" and - r["batch_size"] == config["batch_size"] and r["seq_len"] == config["seq_len"]) + r + for r in results + if r["impl"] == "triton" and r["batch_size"] == config["batch_size"] and r["seq_len"] == config["seq_len"] + ) speedup = tilelang_result["avg_time_ms"] / triton_result["avg_time_ms"] print(f"Speedup (Triton vs TileLang): {speedup:.2f}x") @@ -921,8 +876,7 @@ if __name__ == "__main__": parser.add_argument("--dim", type=int, default=128, help="Head dimension") parser.add_argument("--selected_blocks", type=int, default=16, help="Number of selected blocks") 
parser.add_argument("--block_size", type=int, default=32, help="Block size") - parser.add_argument( - "--dtype", type=str, default="float16", help="Data type (float16 or float32)") + parser.add_argument("--dtype", type=str, default=T.float16, help="Data type (float16 or float32)") parser.add_argument("--scale", type=float, default=0.1, help="Attention scale factor") parser.add_argument("--iterations", type=int, default=100, help="Number of iterations") parser.add_argument("--warmup", type=int, default=10, help="Warmup iterations") @@ -933,7 +887,8 @@ if __name__ == "__main__": type=str, default="all", choices=["tilelang", "triton", "all"], - help="Implementation to benchmark (tilelang, triton, or all)") + help="Implementation to benchmark (tilelang, triton, or all)", + ) args = parser.parse_args() @@ -941,13 +896,12 @@ if __name__ == "__main__": if args.impl in ["triton", "all"] and args.head_query % (args.heads * 16) != 0: # Adjust head_query to nearest valid value args.head_query = ((args.head_query // (args.heads * 16)) + 1) * (args.heads * 16) - print( - f"Adjusted head_query to {args.head_query} to be compatible with Triton implementation") + print(f"Adjusted head_query to {args.head_query} to be compatible with Triton implementation") if args.suite: run_benchmark_suite(impl=args.impl) else: - dtype = torch.float16 if args.dtype == "float16" else torch.float32 + dtype = torch.float16 if args.dtype == T.float16 else torch.float32 if args.impl in ["tilelang", "all"]: print("Benchmarking TileLang implementation:") @@ -963,12 +917,14 @@ if __name__ == "__main__": scale=args.scale, warmup=args.warmup, iterations=args.iterations, - validate=args.validate) + validate=args.validate, + ) print("\nBenchmark Results (TileLang):") print( - f"Configuration: batch={args.batch}, seq_len={args.seq_len}, heads={args.heads}, " + - f"head_query={args.head_query}, dim={args.dim}, blocks={args.selected_blocks}, " + - f"block_size={args.block_size}") + f"Configuration: batch={args.batch}, seq_len={args.seq_len}, heads={args.heads}, " + + f"head_query={args.head_query}, dim={args.dim}, blocks={args.selected_blocks}, " + + f"block_size={args.block_size}" + ) print(f"Average time: {result['avg_time_ms']:.2f} ms") print(f"Performance: {result['tflops']:.2f} TFLOPs") @@ -986,11 +942,13 @@ if __name__ == "__main__": scale=args.scale, warmup=args.warmup, iterations=args.iterations, - validate=args.validate) + validate=args.validate, + ) print("\nBenchmark Results (Triton):") print( - f"Configuration: batch={args.batch}, seq_len={args.seq_len}, heads={args.heads}, " + - f"head_query={args.head_query}, dim={args.dim}, blocks={args.selected_blocks}, " + - f"block_size={args.block_size}") + f"Configuration: batch={args.batch}, seq_len={args.seq_len}, heads={args.heads}, " + + f"head_query={args.head_query}, dim={args.dim}, blocks={args.selected_blocks}, " + + f"block_size={args.block_size}" + ) print(f"Average time: {result['avg_time_ms']:.2f} ms") print(f"Performance: {result['tflops']:.2f} TFLOPs") diff --git a/examples/deepseek_nsa/example_tilelang_nsa_bwd.py b/examples/deepseek_nsa/example_tilelang_nsa_bwd.py index 8387d22714ef2156bff2a8a7fcbcb1473866a3ed..41f1dd86b99833d56b3164c865f55d6ae315311e 100644 --- a/examples/deepseek_nsa/example_tilelang_nsa_bwd.py +++ b/examples/deepseek_nsa/example_tilelang_nsa_bwd.py @@ -7,6 +7,7 @@ import torch import triton import fla + if parse(fla.__version__) < parse("0.2.1"): from fla.ops.common.utils import prepare_token_indices else: @@ -22,7 +23,8 @@ import tilelang 
tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, - }) + } +) def tilelang_kernel_fwd( batch, heads, @@ -34,11 +36,10 @@ def tilelang_kernel_fwd( groups=1, selected_blocks=16, ): - from tilelang import language as T if scale is None: - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) else: scale = scale * 1.44269504 # log2(e) @@ -48,9 +49,9 @@ def tilelang_kernel_fwd( o_slc_shape = [batch, seq_len, heads, dim] lse_slc_shape = [batch, seq_len, heads] block_indices_shape = [batch, seq_len, head_kv, selected_blocks] - block_indices_dtype = "int32" - dtype = "float16" - accum_dtype = "float" + block_indices_dtype = T.int32 + dtype = T.float16 + accum_dtype = T.float32 block_S = block_size block_T = min(128, tilelang.math.next_power_of_2(dim)) @@ -67,12 +68,12 @@ def tilelang_kernel_fwd( @T.prim_func def native_sparse_attention( - Q: T.Tensor(q_shape, dtype), - K: T.Tensor(kv_shape, dtype), - V: T.Tensor(kv_shape, dtype), - BlockIndices: T.Tensor(block_indices_shape, block_indices_dtype), - O_slc: T.Tensor(o_slc_shape, dtype), - LSE_slc: T.Tensor(lse_slc_shape, accum_dtype), + Q: T.Tensor(q_shape, dtype), + K: T.Tensor(kv_shape, dtype), + V: T.Tensor(kv_shape, dtype), + BlockIndices: T.Tensor(block_indices_shape, block_indices_dtype), + O_slc: T.Tensor(o_slc_shape, dtype), + LSE_slc: T.Tensor(lse_slc_shape, accum_dtype), ): with T.Kernel(seq_len, NV, batch * head_kv, threads=threads) as (bx, by, bz): Q_shared = T.alloc_shared([G, BK], dtype) @@ -93,7 +94,7 @@ def tilelang_kernel_fwd( i_b, i_h = i_bh // head_kv, i_bh % head_kv NS = S - T.copy(Q[i_b, i_t, i_h * G:(i_h + 1) * G, :], Q_shared) + T.copy(Q[i_b, i_t, i_h * G : (i_h + 1) * G, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) @@ -103,12 +104,11 @@ def tilelang_kernel_fwd( i_s = BlockIndices[i_b, i_t, i_h, i] * BS if i_s <= i_t and i_s >= 0: # [BS, BK] - T.copy(K[i_b, i_s:i_s + BS, i_h, :], K_shared) + T.copy(K[i_b, i_s : i_s + BS, i_h, :], K_shared) if is_causal: - for i, j in T.Parallel(G, BS): - acc_s[i, j] = T.if_then_else(i_t >= (i_s + j), 0, - -T.infinity(acc_s.dtype)) + for k, j in T.Parallel(G, BS): + acc_s[k, j] = T.if_then_else(i_t >= (i_s + j), 0, -T.infinity(acc_s.dtype)) else: T.clear(acc_s) @@ -124,21 +124,21 @@ def tilelang_kernel_fwd( T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, scores_max, dim=1, clear=True) - for i in T.Parallel(G): - scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) - for i, j in T.Parallel(G, BS): - acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) + for k in T.Parallel(G): + scores_scale[k] = T.exp2(scores_max_prev[k] * scale - scores_max[k] * scale) + for k, j in T.Parallel(G, BS): + acc_s[k, j] = T.exp2(acc_s[k, j] * scale - scores_max[k] * scale) T.reduce_sum(acc_s, scores_sum, dim=1) - for i in T.Parallel(G): - logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] + for k in T.Parallel(G): + logsum[k] = logsum[k] * scores_scale[k] + scores_sum[k] T.copy(acc_s, acc_s_cast) # Rescale - for i, j in T.Parallel(G, BV): - acc_o[i, j] *= scores_scale[i] + for k, j in T.Parallel(G, BV): + acc_o[k, j] *= scores_scale[k] # V * softmax(Q * K) - T.copy(V[i_b, i_s:i_s + BS, i_h, i_v * BV:(i_v + 1) * BV], V_shared) + T.copy(V[i_b, i_s : i_s + BS, i_h, i_v * BV : (i_v + 1) * BV], V_shared) T.gemm(acc_s_cast, V_shared, acc_o, 
policy=T.GemmWarpPolicy.FullRow) for i, j in T.Parallel(G, BV): @@ -146,18 +146,20 @@ def tilelang_kernel_fwd( T.copy(acc_o, O_shared) T.copy( O_shared, - O_slc[i_b, i_t, i_h * G:(i_h + 1) * G, i_v * BV:(i_v + 1) * BV], + O_slc[i_b, i_t, i_h * G : (i_h + 1) * G, i_v * BV : (i_v + 1) * BV], ) for i in T.Parallel(G): logsum[i] = T.log2(logsum[i]) + scores_max[i] * scale - T.copy(logsum, LSE_slc[i_b, i_t, i_h * G:(i_h + 1) * G]) + T.copy(logsum, LSE_slc[i_b, i_t, i_h * G : (i_h + 1) * G]) return native_sparse_attention -@tilelang.jit(pass_configs={ - tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, -}) +@tilelang.jit( + pass_configs={ + tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, + } +) def tilelang_kernel_bwd_dkv( batch, heads, @@ -168,11 +170,11 @@ def tilelang_kernel_bwd_dkv( block_size=64, groups=1, selected_blocks=16, - dtype="float16", - accum_dtype="float", + dtype=T.float16, + accum_dtype=T.float32, ): if scale is None: - sm_scale = (1.0 / dim)**0.5 + sm_scale = (1.0 / dim) ** 0.5 else: sm_scale = scale @@ -207,15 +209,15 @@ def tilelang_kernel_bwd_dkv( @T.prim_func def flash_bwd_dkv( - Q: T.Tensor(q_shape, dtype), - K: T.Tensor(k_shape, dtype), - V: T.Tensor(v_shape, dtype), - LSE_slc: T.Tensor(lse_slc_shape, accum_dtype), - Delta_slc: T.Tensor(delta_slc_shape, accum_dtype), - DO_slc: T.Tensor(do_slc_shape, dtype), - DK: T.Tensor(dk_shape, dtype), - DV: T.Tensor(dv_shape, dtype), - BlockMask: T.Tensor(block_mask_shape, "int32"), + Q: T.Tensor(q_shape, dtype), + K: T.Tensor(k_shape, dtype), + V: T.Tensor(v_shape, dtype), + LSE_slc: T.Tensor(lse_slc_shape, accum_dtype), + Delta_slc: T.Tensor(delta_slc_shape, accum_dtype), + DO_slc: T.Tensor(do_slc_shape, dtype), + DK: T.Tensor(dk_shape, dtype), + DV: T.Tensor(dv_shape, dtype), + BlockMask: T.Tensor(block_mask_shape, T.int32), ): with T.Kernel(NV, NS, B * H, threads=num_threads) as (i_v, i_s, i_bh): K_shared = T.alloc_shared([BS, BK], dtype) @@ -238,31 +240,33 @@ def tilelang_kernel_bwd_dkv( i_b, i_h = i_bh // H, i_bh % H - T.copy(K[i_b, i_s * BS:(i_s + 1) * BS, i_h, :BK], K_shared) - T.copy(V[i_b, i_s * BS:(i_s + 1) * BS, i_h, :BV], V_shared) + T.copy(K[i_b, i_s * BS : (i_s + 1) * BS, i_h, :BK], K_shared) + T.copy(V[i_b, i_s * BS : (i_s + 1) * BS, i_h, :BV], V_shared) # [BS, BK] T.clear(dk) # [BS, BV] T.clear(dv) - T.annotate_layout({ - K_shared: tilelang.layout.make_swizzled_layout(K_shared), - dv_shared: tilelang.layout.make_swizzled_layout(dv_shared), - dk_shared: tilelang.layout.make_swizzled_layout(dk_shared), - }) + T.annotate_layout( + { + K_shared: tilelang.layout.make_swizzled_layout(K_shared), + dv_shared: tilelang.layout.make_swizzled_layout(dv_shared), + dk_shared: tilelang.layout.make_swizzled_layout(dk_shared), + } + ) loop_st = i_s * BS loop_ed = seq_len for i in T.Pipelined( - start=loop_st, - stop=loop_ed, - num_stages=0, + start=loop_st, + stop=loop_ed, + num_stages=0, ): b_m_slc = BlockMask[i_b, i, i_h, i_s] if b_m_slc != 0: # [G, BK] - T.copy(Q[i_b, i, i_h * G:(i_h + 1) * G, :BK], Q_shared) + T.copy(Q[i_b, i, i_h * G : (i_h + 1) * G, :BK], Q_shared) T.clear(qkT) # [BS, BK] @ [G, BK] -> [BS, G] T.gemm( @@ -273,7 +277,7 @@ def tilelang_kernel_bwd_dkv( policy=T.GemmWarpPolicy.FullRow, ) # [G] - T.copy(LSE_slc[i_b, i, i_h * G:(i_h + 1) * G], lse_shared) + T.copy(LSE_slc[i_b, i, i_h * G : (i_h + 1) * G], lse_shared) for _i, _j in T.Parallel(BS, G): qkT[_i, _j] = T.exp2(qkT[_i, _j] * scale - lse_shared[_j]) @@ -282,7 +286,7 @@ def tilelang_kernel_bwd_dkv( qkT[_i, _j] = T.if_then_else(i >= (i_s * BS + _i), qkT[_i, 
_j], 0) # [G, BV] - T.copy(DO_slc[i_b, i, i_h * G:(i_h + 1) * G, :BV], do) + T.copy(DO_slc[i_b, i, i_h * G : (i_h + 1) * G, :BV], do) T.clear(dsT) # [BS, BV] @ [G, BV] -> [BS, G] T.gemm( @@ -296,7 +300,7 @@ def tilelang_kernel_bwd_dkv( # [BS, G] @ [G, BV] -> [BS, BV] T.gemm(qkT_cast, do, dv, policy=T.GemmWarpPolicy.FullRow) # [G] - T.copy(Delta_slc[i_b, i, i_h * G:(i_h + 1) * G], delta) + T.copy(Delta_slc[i_b, i, i_h * G : (i_h + 1) * G], delta) for i, j in T.Parallel(BS, G): dsT_cast[i, j] = qkT[i, j] * (dsT[i, j] - delta[j]) * sm_scale @@ -305,8 +309,8 @@ def tilelang_kernel_bwd_dkv( T.copy(dv, dv_shared) T.copy(dk, dk_shared) - T.copy(dv_shared, DV[i_b, i_s * BS:(i_s + 1) * BS, i_h, :BV]) - T.copy(dk_shared, DK[i_v, i_b, i_s * BS:(i_s + 1) * BS, i_h, :BK]) + T.copy(dv_shared, DV[i_b, i_s * BS : (i_s + 1) * BS, i_h, :BV]) + T.copy(dk_shared, DK[i_v, i_b, i_s * BS : (i_s + 1) * BS, i_h, :BK]) return flash_bwd_dkv @@ -321,9 +325,11 @@ def make_dq_layout(dQ): ) -@tilelang.jit(pass_configs={ - tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, -}) +@tilelang.jit( + pass_configs={ + tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, + } +) def tilelang_kernel_bwd_dqkv( batch, heads, @@ -334,11 +340,11 @@ def tilelang_kernel_bwd_dqkv( block_size=64, groups=1, selected_blocks=16, - dtype="float16", - accum_dtype="float", + dtype=T.float16, + accum_dtype=T.float32, ): if scale is None: - sm_scale = (1.0 / dim)**0.5 + sm_scale = (1.0 / dim) ** 0.5 else: sm_scale = scale @@ -373,16 +379,16 @@ def tilelang_kernel_bwd_dqkv( @T.prim_func def flash_bwd_dqkv( - Q: T.Tensor(q_shape, dtype), - K: T.Tensor(k_shape, dtype), - V: T.Tensor(v_shape, dtype), - LSE_slc: T.Tensor(lse_slc_shape, accum_dtype), - Delta_slc: T.Tensor(delta_slc_shape, accum_dtype), - DO_slc: T.Tensor(do_slc_shape, dtype), - DQ: T.Tensor(dq_shape, dtype), - DK: T.Tensor(dk_shape, dtype), - DV: T.Tensor(dv_shape, dtype), - BlockMask: T.Tensor(block_mask_shape, "int32"), + Q: T.Tensor(q_shape, dtype), + K: T.Tensor(k_shape, dtype), + V: T.Tensor(v_shape, dtype), + LSE_slc: T.Tensor(lse_slc_shape, accum_dtype), + Delta_slc: T.Tensor(delta_slc_shape, accum_dtype), + DO_slc: T.Tensor(do_slc_shape, dtype), + DQ: T.Tensor(dq_shape, dtype), + DK: T.Tensor(dk_shape, dtype), + DV: T.Tensor(dv_shape, dtype), + BlockMask: T.Tensor(block_mask_shape, T.int32), ): with T.Kernel(NV, NS, B * H, threads=num_threads) as (i_v, i_s, i_bh): K_shared = T.alloc_shared([BS, BK], dtype) @@ -406,31 +412,33 @@ def tilelang_kernel_bwd_dqkv( i_b, i_h = i_bh // H, i_bh % H - T.copy(K[i_b, i_s * BS:(i_s + 1) * BS, i_h, :BK], K_shared) - T.copy(V[i_b, i_s * BS:(i_s + 1) * BS, i_h, :BV], V_shared) + T.copy(K[i_b, i_s * BS : (i_s + 1) * BS, i_h, :BK], K_shared) + T.copy(V[i_b, i_s * BS : (i_s + 1) * BS, i_h, :BV], V_shared) # [BS, BK] T.clear(dk) # [BS, BV] T.clear(dv) - T.annotate_layout({ - K_shared: tilelang.layout.make_swizzled_layout(K_shared), - dv_shared: tilelang.layout.make_swizzled_layout(dv_shared), - dk_shared: tilelang.layout.make_swizzled_layout(dk_shared), - }) + T.annotate_layout( + { + K_shared: tilelang.layout.make_swizzled_layout(K_shared), + dv_shared: tilelang.layout.make_swizzled_layout(dv_shared), + dk_shared: tilelang.layout.make_swizzled_layout(dk_shared), + } + ) loop_st = i_s * BS loop_ed = seq_len for i in T.Pipelined( - start=loop_st, - stop=loop_ed, - num_stages=0, + start=loop_st, + stop=loop_ed, + num_stages=0, ): b_m_slc = BlockMask[i_b, i, i_h, i_s] if b_m_slc != 0: # [G, BK] - T.copy(Q[i_b, i, i_h * G:(i_h + 1) * G, :BK], Q_shared) + 
T.copy(Q[i_b, i, i_h * G : (i_h + 1) * G, :BK], Q_shared) T.clear(qkT) # [BS, BK] @ [G, BK] -> [BS, G] T.gemm( @@ -441,7 +449,7 @@ def tilelang_kernel_bwd_dqkv( policy=T.GemmWarpPolicy.FullRow, ) # [G] - T.copy(LSE_slc[i_b, i, i_h * G:(i_h + 1) * G], lse_shared) + T.copy(LSE_slc[i_b, i, i_h * G : (i_h + 1) * G], lse_shared) for _i, _j in T.Parallel(BS, G): qkT[_i, _j] = T.exp2(qkT[_i, _j] * scale - lse_shared[_j]) @@ -450,7 +458,7 @@ def tilelang_kernel_bwd_dqkv( qkT[_i, _j] = T.if_then_else(i >= (i_s * BS + _i), qkT[_i, _j], 0) # [G, BV] - T.copy(DO_slc[i_b, i, i_h * G:(i_h + 1) * G, :BV], do) + T.copy(DO_slc[i_b, i, i_h * G : (i_h + 1) * G, :BV], do) T.clear(dsT) # [BS, BV] @ [G, BV] -> [BS, G] T.gemm( @@ -464,9 +472,9 @@ def tilelang_kernel_bwd_dqkv( # [BS, G] @ [G, BV] -> [BS, BV] T.gemm(qkT_cast, do, dv, policy=T.GemmWarpPolicy.FullRow) # [G] - T.copy(Delta_slc[i_b, i, i_h * G:(i_h + 1) * G], delta) - for i, j in T.Parallel(BS, G): - dsT_cast[i, j] = qkT[i, j] * (dsT[i, j] - delta[j]) * sm_scale + T.copy(Delta_slc[i_b, i, i_h * G : (i_h + 1) * G], delta) + for _i, _j in T.Parallel(BS, G): + dsT_cast[_i, _j] = qkT[_i, _j] * (dsT[_i, _j] - delta[_j]) * sm_scale # [BS, G] @ [G, BK] -> [BS, BK] T.gemm(dsT_cast, Q_shared, dk, policy=T.GemmWarpPolicy.FullRow) @@ -480,23 +488,25 @@ def tilelang_kernel_bwd_dqkv( T.copy(dv, dv_shared) T.copy(dk, dk_shared) - T.copy(dv_shared, DV[i_b, i_s * BS:(i_s + 1) * BS, i_h, :BV]) - T.copy(dk_shared, DK[i_v, i_b, i_s * BS:(i_s + 1) * BS, i_h, :BK]) + T.copy(dv_shared, DV[i_b, i_s * BS : (i_s + 1) * BS, i_h, :BV]) + T.copy(dk_shared, DK[i_v, i_b, i_s * BS : (i_s + 1) * BS, i_h, :BK]) return flash_bwd_dqkv @tilelang.jit( - out_idx=[2], pass_configs={ + out_idx=[2], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def tilelang_kernel_preprocess( batch, heads, seq_len, dim, - dtype="float16", - accum_dtype="float", + dtype=T.float16, + accum_dtype=T.float32, blk=32, ): from tilelang import language as T @@ -505,9 +515,9 @@ def tilelang_kernel_preprocess( @T.prim_func def flash_bwd_prep( - O: T.Tensor(shape, dtype), # type: ignore - dO: T.Tensor(shape, dtype), # type: ignore - Delta: T.Tensor([batch, seq_len, heads], accum_dtype), # type: ignore + O: T.Tensor(shape, dtype), # type: ignore + dO: T.Tensor(shape, dtype), # type: ignore + Delta: T.Tensor([batch, seq_len, heads], accum_dtype), # type: ignore ): with T.Kernel(heads, T.ceildiv(seq_len, blk), batch) as (bx, by, bz): o = T.alloc_fragment([blk, blk], dtype) @@ -516,27 +526,29 @@ def tilelang_kernel_preprocess( delta = T.alloc_fragment([blk], accum_dtype) T.clear(acc) for k in range(T.ceildiv(dim, blk)): - T.copy(O[bz, by * blk:(by + 1) * blk, bx, k * blk:(k + 1) * blk], o) - T.copy(dO[bz, by * blk:(by + 1) * blk, bx, k * blk:(k + 1) * blk], do) + T.copy(O[bz, by * blk : (by + 1) * blk, bx, k * blk : (k + 1) * blk], o) + T.copy(dO[bz, by * blk : (by + 1) * blk, bx, k * blk : (k + 1) * blk], do) for i, j in T.Parallel(blk, blk): acc[i, j] += o[i, j] * do[i, j] T.reduce_sum(acc, delta, 1) - T.copy(delta, Delta[bz, by * blk:(by + 1) * blk, bx]) + T.copy(delta, Delta[bz, by * blk : (by + 1) * blk, bx]) return flash_bwd_prep @tilelang.jit( - out_idx=[2], pass_configs={ + out_idx=[2], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def tilelang_kernel_block_mask( batch, heads, seq_len, selected_blocks, block_size, - dtype="int32", + dtype=T.int32, ): from tilelang import language as T @@ -551,9 +563,9 @@ def tilelang_kernel_block_mask( @T.prim_func 
def flash_bwd_block_mask( - BlockIndices: T.Tensor(block_indices_shape, dtype), # type: ignore - BlockCounts: T.Tensor(block_counts_shape, dtype), # type: ignore - BlockMask: T.Tensor(block_mask_shape, dtype), # type: ignore + BlockIndices: T.Tensor(block_indices_shape, dtype), # type: ignore + BlockCounts: T.Tensor(block_counts_shape, dtype), # type: ignore + BlockMask: T.Tensor(block_mask_shape, dtype), # type: ignore ): with T.Kernel(seq_len, batch, heads * S) as (bx, by, bz): i_t, i_b, i_hs = bx, by, bz @@ -603,9 +615,7 @@ def parallel_nsa_bwd( dk = torch.empty(NV, *k.shape, dtype=k.dtype, device=q.device) dv = torch.empty(v.shape, dtype=v.dtype, device=q.device) - block_mask = tilelang_kernel_block_mask(B, H, T, S, - BS)(block_indices.to(torch.int32), - block_counts.to(torch.int32)).to(torch.bool) + block_mask = tilelang_kernel_block_mask(B, H, T, S, BS)(block_indices.to(torch.int32), block_counts.to(torch.int32)).to(torch.bool) fused_qkv_bwd_kernel = tilelang_kernel_bwd_dqkv( batch=B, @@ -618,8 +628,7 @@ def parallel_nsa_bwd( selected_blocks=S, scale=scale, ) - fused_qkv_bwd_kernel(q, k, v, lse_slc, delta_slc, do_slc, dq, dk, dv, - block_mask.to(torch.int32)) + fused_qkv_bwd_kernel(q, k, v, lse_slc, delta_slc, do_slc, dq, dk, dv, block_mask.to(torch.int32)) dq = dq.sum(0) dk = dk.sum(0) @@ -628,7 +637,6 @@ def parallel_nsa_bwd( @torch.compile class ParallelNSAFunction(torch.autograd.Function): - @staticmethod @contiguous @autocast_custom_fwd @@ -773,23 +781,21 @@ def parallel_nsa( Outputs of shape `[B, SEQLEN, HQ, V]` if `head_first=False` else `[B, HQ, SEQLEN, V]`. """ if scale is None: - scale = k.shape[-1]**-0.5 + scale = k.shape[-1] ** -0.5 if cu_seqlens is not None: assert q.shape[0] == 1, "batch size must be 1 when cu_seqlens are provided" if head_first: - q, k, v, block_indices = map(lambda x: rearrange(x, "b h t d -> b t h d"), - (q, k, v, block_indices)) + q, k, v, block_indices = map(lambda x: rearrange(x, "b h t d -> b t h d"), (q, k, v, block_indices)) g_slc, g_swa = map(lambda x: rearrange(x, "b h t -> b t h"), (g_slc, g_swa)) if isinstance(block_counts, torch.Tensor): block_counts = rearrange(block_counts, "b h t -> b t h") - assert (q.shape[2] % (k.shape[2] * 16) == 0), "Group size must be a multiple of 16 in NSA" + assert q.shape[2] % (k.shape[2] * 16) == 0, "Group size must be a multiple of 16 in NSA" if isinstance(block_counts, int): block_indices = block_indices[:, :, :, :block_counts] block_counts = None - o_slc, o_swa = ParallelNSAFunction.apply(q, k, v, block_indices, block_counts, block_size, - window_size, scale, cu_seqlens) + o_slc, o_swa = ParallelNSAFunction.apply(q, k, v, block_indices, block_counts, block_size, window_size, scale, cu_seqlens) if window_size > 0: o = torch.addcmul(o_slc * g_slc.unsqueeze(-1), o_swa, g_swa.unsqueeze(-1)) else: @@ -814,7 +820,7 @@ if __name__ == "__main__": for t in range(T): for h in range(H): i_i = torch.randperm(max(1, (t // block_size)))[:S] - block_indices[b, t, h, :len(i_i)] = i_i + block_indices[b, t, h, : len(i_i)] = i_i block_indices = block_indices.sort(-1)[0] block_counts = torch.randint(1, S + 1, (B, T, H), device="cuda") diff --git a/examples/deepseek_nsa/example_tilelang_nsa_decode.py b/examples/deepseek_nsa/example_tilelang_nsa_decode.py index 58f4355094ae914c3520d4bdd2877c2a08ef58fc..b7eea58049b388ceca54c7f1883d1d5a4ab755a6 100644 --- a/examples/deepseek_nsa/example_tilelang_nsa_decode.py +++ b/examples/deepseek_nsa/example_tilelang_nsa_decode.py @@ -16,7 +16,8 @@ tilelang.testing.set_random_seed(42) 
tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def native_sparse_attention( batch, heads, @@ -25,18 +26,18 @@ def native_sparse_attention( scale=None, block_size=64, # Tile size for attention computation groups=1, # Grouped query attention (GQA) groups - selected_blocks=16 # Number of blocks to select per attention head + selected_blocks=16, # Number of blocks to select per attention head ): if scale is None: - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) head_kv = heads // groups # Modified shapes for inference (q has seq_len=1)a q_shape = [batch, 1, heads, dim] # Changed seq_len to 1 kv_shape = [batch, seq_len, head_kv, dim] block_indices_shape = [batch, 1, head_kv, selected_blocks] # Changed seq_len to 1 - block_indices_dtype = "int32" - dtype = "float16" - accum_dtype = "float" + block_indices_dtype = T.int32 + dtype = T.float16 + accum_dtype = T.float32 block_S = block_size block_T = min(128, tilelang.math.next_power_of_2(dim)) @@ -53,12 +54,11 @@ def native_sparse_attention( @T.prim_func def native_sparse_attention( - Q: T.Tensor(q_shape, dtype), # [batch, 1, heads, dim] - K: T.Tensor(kv_shape, dtype), # [batch, seq_len, head_kv, dim] - V: T.Tensor(kv_shape, dtype), # Same shape as K - BlockIndices: T.Tensor(block_indices_shape, - block_indices_dtype), # Selected block indices - Output: T.Tensor(q_shape, dtype), # Output attention tensor + Q: T.Tensor(q_shape, dtype), # [batch, 1, heads, dim] + K: T.Tensor(kv_shape, dtype), # [batch, seq_len, head_kv, dim] + V: T.Tensor(kv_shape, dtype), # Same shape as K + BlockIndices: T.Tensor(block_indices_shape, block_indices_dtype), # Selected block indices + Output: T.Tensor(q_shape, dtype), # Output attention tensor ): with T.Kernel(1, NV, batch * head_kv, threads=threads) as (bx, by, bz): # Shared memory allocations for tile storage @@ -82,7 +82,7 @@ def native_sparse_attention( NS = S # Copy Q for the single position - T.copy(Q[i_b, 0, i_h * G:(i_h + 1) * G, :], Q_shared) # Changed i_t to 0 + T.copy(Q[i_b, 0, i_h * G : (i_h + 1) * G, :], Q_shared) # Changed i_t to 0 T.fill(acc_o, 0) T.fill(logsum, 0) @@ -93,16 +93,11 @@ def native_sparse_attention( i_s = BlockIndices[i_b, 0, i_h, i] * BS # Get block offset if i_s >= 0: # Skip invalid/padding blocks # Load current key block to shared memory - T.copy(K[i_b, i_s:i_s + BS, i_h, :], K_shared) + T.copy(K[i_b, i_s : i_s + BS, i_h, :], K_shared) # Compute QK^T attention scores T.clear(acc_s) - T.gemm( - Q_shared, - K_shared, - acc_s, - transpose_B=True, - policy=T.GemmWarpPolicy.FullRow) + T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) # Online softmax with numerical stability # 1. 
Compute max for scaling @@ -122,15 +117,14 @@ def native_sparse_attention( T.copy(acc_s, acc_s_cast) # Accumulate attention-weighted values - T.copy(V[i_b, i_s:i_s + BS, i_h, i_v * BV:(i_v + 1) * BV], V_shared) + T.copy(V[i_b, i_s : i_s + BS, i_h, i_v * BV : (i_v + 1) * BV], V_shared) T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) # Final normalization and output for i, j in T.Parallel(G, BV): acc_o[i, j] /= logsum[i] # Normalize by logsum T.copy(acc_o, O_shared) - T.copy(O_shared, Output[i_b, 0, i_h * G:(i_h + 1) * G, - i_v * BV:(i_v + 1) * BV]) # Changed i_t to 0 + T.copy(O_shared, Output[i_b, 0, i_h * G : (i_h + 1) * G, i_v * BV : (i_v + 1) * BV]) # Changed i_t to 0 return native_sparse_attention @@ -149,21 +143,21 @@ def main(): selected_blocks=S, ) - Q = torch.randn((B, SEQ_LEN_Q, HQ, D), dtype=dtype, device='cuda').requires_grad_(True) - K = torch.randn((B, SEQ_LEN, H, D), dtype=dtype, device='cuda').requires_grad_(True) - V = torch.randn((B, SEQ_LEN, H, D), dtype=dtype, device='cuda').requires_grad_(True) + Q = torch.randn((B, SEQ_LEN_Q, HQ, D), dtype=dtype, device="cuda").requires_grad_(True) + K = torch.randn((B, SEQ_LEN, H, D), dtype=dtype, device="cuda").requires_grad_(True) + V = torch.randn((B, SEQ_LEN, H, D), dtype=dtype, device="cuda").requires_grad_(True) - mask = torch.randint(0, 2, (B, SEQ_LEN, groups), device='cuda') - DO = torch.randn((B, SEQ_LEN_Q, HQ, D), dtype=dtype, device='cuda') + mask = torch.randint(0, 2, (B, SEQ_LEN, groups), device="cuda") + DO = torch.randn((B, SEQ_LEN_Q, HQ, D), dtype=dtype, device="cuda") - block_indices = torch.full((B, SEQ_LEN_Q, H, S), SEQ_LEN, dtype=torch.long, device='cuda') + block_indices = torch.full((B, SEQ_LEN_Q, H, S), SEQ_LEN, dtype=torch.long, device="cuda") for b in range(B): for t in range(SEQ_LEN_Q): for h in range(H): i_i = torch.randperm(max(1, (t // block_size)))[:S] - block_indices[b, t, h, :len(i_i)] = i_i + block_indices[b, t, h, : len(i_i)] = i_i block_indices = block_indices.sort(-1)[0] - block_counts = torch.randint(1, S + 1, (B, SEQ_LEN_Q, H), device='cuda') + block_counts = torch.randint(1, S + 1, (B, SEQ_LEN_Q, H), device="cuda") out = kernel(Q, K, V, block_indices.to(torch.int32)) diff --git a/examples/deepseek_nsa/example_tilelang_nsa_fwd.py b/examples/deepseek_nsa/example_tilelang_nsa_fwd.py index f8a7ebfb0c450ce6e1dbb50a0e4cbc99772b1c08..ad36b10402429cdf24b87016c084b2790a0ba0eb 100644 --- a/examples/deepseek_nsa/example_tilelang_nsa_fwd.py +++ b/examples/deepseek_nsa/example_tilelang_nsa_fwd.py @@ -14,18 +14,11 @@ tilelang.testing.set_random_seed(0) tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, - }) -def native_sparse_attention(batch, - heads, - seq_len, - dim, - is_causal, - scale=None, - block_size=64, - groups=1, - selected_blocks=16): + }, +) +def native_sparse_attention(batch, heads, seq_len, dim, is_causal, scale=None, block_size=64, groups=1, selected_blocks=16): if scale is None: - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) else: scale = scale * 1.44269504 # log2(e) @@ -33,9 +26,9 @@ def native_sparse_attention(batch, q_shape = [batch, seq_len, heads, dim] kv_shape = [batch, seq_len, head_kv, dim] block_indices_shape = [batch, seq_len, head_kv, selected_blocks] - block_indices_dtype = "int32" - dtype = "float16" - accum_dtype = "float" + block_indices_dtype = T.int32 + dtype = T.float16 + accum_dtype = T.float32 
block_S = block_size block_T = min(128, tilelang.math.next_power_of_2(dim)) @@ -52,11 +45,11 @@ def native_sparse_attention(batch, @T.prim_func def native_sparse_attention( - Q: T.Tensor(q_shape, dtype), - K: T.Tensor(kv_shape, dtype), - V: T.Tensor(kv_shape, dtype), - BlockIndices: T.Tensor(block_indices_shape, block_indices_dtype), - Output: T.Tensor(q_shape, dtype), + Q: T.Tensor(q_shape, dtype), + K: T.Tensor(kv_shape, dtype), + V: T.Tensor(kv_shape, dtype), + BlockIndices: T.Tensor(block_indices_shape, block_indices_dtype), + Output: T.Tensor(q_shape, dtype), ): with T.Kernel(seq_len, NV, batch * head_kv, threads=threads) as (bx, by, bz): Q_shared = T.alloc_shared([G, BK], dtype) @@ -77,7 +70,7 @@ def native_sparse_attention(batch, i_b, i_h = i_bh // head_kv, i_bh % head_kv NS = S - T.copy(Q[i_b, i_t, i_h * G:(i_h + 1) * G, :], Q_shared) + T.copy(Q[i_b, i_t, i_h * G : (i_h + 1) * G, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) @@ -87,21 +80,15 @@ def native_sparse_attention(batch, i_s = BlockIndices[i_b, i_t, i_h, i] * BS if i_s <= i_t and i_s >= 0: # [BS, BK] - T.copy(K[i_b, i_s:i_s + BS, i_h, :], K_shared) + T.copy(K[i_b, i_s : i_s + BS, i_h, :], K_shared) if is_causal: for i, j in T.Parallel(G, BS): - acc_s[i, j] = T.if_then_else(i_t >= (i_s + j), 0, - -T.infinity(acc_s.dtype)) + acc_s[i, j] = T.if_then_else(i_t >= (i_s + j), 0, -T.infinity(acc_s.dtype)) else: T.clear(acc_s) - T.gemm( - Q_shared, - K_shared, - acc_s, - transpose_B=True, - policy=T.GemmWarpPolicy.FullRow) + T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) # Softmax T.copy(scores_max, scores_max_prev) @@ -121,13 +108,13 @@ def native_sparse_attention(batch, acc_o[i, j] *= scores_scale[i] # V * softmax(Q * K) - T.copy(V[i_b, i_s:i_s + BS, i_h, i_v * BV:(i_v + 1) * BV], V_shared) + T.copy(V[i_b, i_s : i_s + BS, i_h, i_v * BV : (i_v + 1) * BV], V_shared) T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) for i, j in T.Parallel(G, BV): acc_o[i, j] /= logsum[i] T.copy(acc_o, O_shared) - T.copy(O_shared, Output[i_b, i_t, i_h * G:(i_h + 1) * G, i_v * BV:(i_v + 1) * BV]) + T.copy(O_shared, Output[i_b, i_t, i_h * G : (i_h + 1) * G, i_v * BV : (i_v + 1) * BV]) return native_sparse_attention @@ -148,21 +135,22 @@ def main(): ) print(kernel.get_kernel_source()) torch.random.manual_seed(0) - Q = torch.randn((B, SEQ_LEN, HQ, D), dtype=dtype, device='cuda').requires_grad_(True) - K = torch.randn((B, SEQ_LEN, H, D), dtype=dtype, device='cuda').requires_grad_(True) - V = torch.randn((B, SEQ_LEN, H, D), dtype=dtype, device='cuda').requires_grad_(True) - g_slc = torch.ones((B, SEQ_LEN, HQ), dtype=dtype, device='cuda').requires_grad_(True) - g_swa = torch.ones((B, SEQ_LEN, HQ), dtype=dtype, device='cuda').requires_grad_(True) - DO = torch.randn((B, SEQ_LEN, HQ, D), dtype=dtype, device='cuda') - - block_indices = torch.full((B, SEQ_LEN, H, S), SEQ_LEN, dtype=torch.long, device='cuda') + Q = torch.randn((B, SEQ_LEN, HQ, D), dtype=dtype, device="cuda").requires_grad_(True) + K = torch.randn((B, SEQ_LEN, H, D), dtype=dtype, device="cuda").requires_grad_(True) + V = torch.randn((B, SEQ_LEN, H, D), dtype=dtype, device="cuda").requires_grad_(True) + g_slc = torch.ones((B, SEQ_LEN, HQ), dtype=dtype, device="cuda").requires_grad_(True) + g_swa = torch.ones((B, SEQ_LEN, HQ), dtype=dtype, device="cuda").requires_grad_(True) + DO = torch.randn((B, SEQ_LEN, HQ, D), dtype=dtype, device="cuda") + + block_indices = torch.full((B, SEQ_LEN, H, S), SEQ_LEN, dtype=torch.long, device="cuda") + 
block_counts = torch.zeros((B, SEQ_LEN, H), dtype=torch.long, device="cuda") for b in range(B): for t in range(SEQ_LEN): for h in range(H): i_i = torch.randperm(max(1, (t // block_size)))[:S] - block_indices[b, t, h, :len(i_i)] = i_i + block_indices[b, t, h, : len(i_i)] = i_i + block_counts[b, t, h] = (block_indices[b, t, h] != SEQ_LEN).sum().item() block_indices = block_indices.sort(-1)[0] - block_counts = torch.randint(1, S + 1, (B, SEQ_LEN, H), device='cuda') out = kernel(Q, K, V, block_indices.to(torch.int32)) diff --git a/examples/deepseek_nsa/example_tilelang_nsa_fwd_varlen.py b/examples/deepseek_nsa/example_tilelang_nsa_fwd_varlen.py index d365e7a5f952377f2f326994e6d93ec48a8d0753..b52ebe42e210823de24107b9990cc111d5c8f1b3 100644 --- a/examples/deepseek_nsa/example_tilelang_nsa_fwd_varlen.py +++ b/examples/deepseek_nsa/example_tilelang_nsa_fwd_varlen.py @@ -8,6 +8,7 @@ from tilelang import language as T import tilelang.testing import fla + if parse(fla.__version__) < parse("0.2.1"): from fla.ops.common.utils import prepare_token_indices else: @@ -21,18 +22,11 @@ from einops import rearrange tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, - }) -def native_sparse_attention_varlen(batch, - heads, - c_seq_len, - dim, - is_causal, - scale=None, - block_size=64, - groups=1, - selected_blocks=16): + } +) +def native_sparse_attention_varlen(batch, heads, c_seq_len, dim, is_causal, scale=None, block_size=64, groups=1, selected_blocks=16): if scale is None: - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) head_kv = heads // groups q_shape = [c_seq_len, heads, dim] kv_shape = [c_seq_len, head_kv, dim] @@ -44,12 +38,12 @@ def native_sparse_attention_varlen(batch, block_counts_shape = [c_seq_len, head_kv] offsets_shape = [batch + 1] token_indices_shape = [c_seq_len, 2] - block_indices_dtype = "int32" - block_counts_dtype = "int32" - offsets_dtype = "int32" - token_indices_dtype = "int32" - dtype = "float16" - accum_dtype = "float" + block_indices_dtype = T.int32 + block_counts_dtype = T.int32 + offsets_dtype = T.int32 + token_indices_dtype = T.int32 + dtype = T.float16 + accum_dtype = T.float32 block_S = block_size block_T = min(128, tilelang.math.next_power_of_2(dim)) @@ -66,14 +60,14 @@ def native_sparse_attention_varlen(batch, @T.prim_func def native_sparse_attention_varlen( - Q: T.Tensor(q_shape, dtype), - K: T.Tensor(kv_shape, dtype), - V: T.Tensor(kv_shape, dtype), - O_slc: T.Tensor(o_slc_shape, dtype), - BlockIndices: T.Tensor(block_indices_shape, block_indices_dtype), - BlockCounts: T.Tensor(block_counts_shape, block_counts_dtype), - Offsets: T.Tensor(offsets_shape, offsets_dtype), - TokenIndices: T.Tensor(token_indices_shape, token_indices_dtype), + Q: T.Tensor(q_shape, dtype), + K: T.Tensor(kv_shape, dtype), + V: T.Tensor(kv_shape, dtype), + O_slc: T.Tensor(o_slc_shape, dtype), + BlockIndices: T.Tensor(block_indices_shape, block_indices_dtype), + BlockCounts: T.Tensor(block_counts_shape, block_counts_dtype), + Offsets: T.Tensor(offsets_shape, offsets_dtype), + TokenIndices: T.Tensor(token_indices_shape, token_indices_dtype), ): with T.Kernel(c_seq_len, NV, batch * head_kv, threads=threads) as (bx, by, bz): Q_shared = T.alloc_shared([G, BK], dtype) @@ -100,7 +94,7 @@ def native_sparse_attention_varlen(batch, current_seq_len = eos - bos NS = BlockCounts[i_t, i_h] - T.copy(Q[bos + i_t, i_h * G:(i_h + 1) * G, :BK], Q_shared) + 
T.copy(Q[bos + i_t, i_h * G : (i_h + 1) * G, :BK], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) @@ -112,21 +106,15 @@ def native_sparse_attention_varlen(batch, # [BS, BK] # Lei: may have some padding issues # we should learn from mha varlen templates to handle this - T.copy(K[bos + i_s:bos + i_s + BS, i_h, :BK], K_shared) + T.copy(K[bos + i_s : bos + i_s + BS, i_h, :BK], K_shared) if is_causal: for i, j in T.Parallel(G, BS): - acc_s[i, j] = T.if_then_else(i_t >= (i_s + j), 0, - -T.infinity(acc_s.dtype)) + acc_s[i, j] = T.if_then_else(i_t >= (i_s + j), 0, -T.infinity(acc_s.dtype)) else: T.clear(acc_s) - T.gemm( - Q_shared, - K_shared, - acc_s, - transpose_B=True, - policy=T.GemmWarpPolicy.FullRow) + T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) # Softmax T.copy(scores_max, scores_max_prev) @@ -146,13 +134,13 @@ def native_sparse_attention_varlen(batch, acc_o[i, j] *= scores_scale[i] # V * softmax(Q * K) - T.copy(V[bos + i_s:bos + i_s + BS, i_h, i_v * BV:(i_v + 1) * BV], V_shared) + T.copy(V[bos + i_s : bos + i_s + BS, i_h, i_v * BV : (i_v + 1) * BV], V_shared) T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) for i, j in T.Parallel(G, BV): acc_o[i, j] /= logsum[i] T.copy(acc_o, O_shared) - T.copy(O_shared, O_slc[bos + i_t, i_h * G:(i_h + 1) * G, i_v * BV:(i_v + 1) * BV]) + T.copy(O_shared, O_slc[bos + i_t, i_h * G : (i_h + 1) * G, i_v * BV : (i_v + 1) * BV]) return native_sparse_attention_varlen @@ -190,17 +178,20 @@ def parallel_nsa_fwd( o_slc = torch.empty(B, C_SEQ_LEN, HQ, V, dtype=v.dtype, device=q.device) kernel( - q.view(C_SEQ_LEN, HQ, D), k.view(C_SEQ_LEN, H, D), v.view(C_SEQ_LEN, H, D), + q.view(C_SEQ_LEN, HQ, D), + k.view(C_SEQ_LEN, H, D), + v.view(C_SEQ_LEN, H, D), o_slc.view(C_SEQ_LEN, HQ, V), block_indices.to(torch.int32).view(C_SEQ_LEN, H, S), - block_counts.to(torch.int32).view(C_SEQ_LEN, H), offsets.to(torch.int32), - token_indices.to(torch.int32)) + block_counts.to(torch.int32).view(C_SEQ_LEN, H), + offsets.to(torch.int32), + token_indices.to(torch.int32), + ) return o_slc @torch.compile class ParallelNSAFunction(torch.autograd.Function): - @staticmethod def forward(ctx, q, k, v, block_indices, block_counts, block_size, window_size, scale, offsets): ctx.dtype = q.dtype @@ -221,22 +212,25 @@ class ParallelNSAFunction(torch.autograd.Function): window_size=window_size, scale=scale, offsets=offsets, - token_indices=token_indices) + token_indices=token_indices, + ) return o_slc.to(q.dtype) -def parallel_nsa(q: torch.Tensor, - k: torch.Tensor, - v: torch.Tensor, - g_slc: torch.Tensor, - g_swa: torch.Tensor, - block_indices: torch.LongTensor, - block_counts: Optional[Union[torch.LongTensor, int]] = None, - block_size: int = 64, - window_size: int = 0, - scale: Optional[float] = None, - cu_seqlens: Optional[torch.LongTensor] = None, - head_first: bool = False) -> torch.Tensor: +def parallel_nsa( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + g_slc: torch.Tensor, + g_swa: torch.Tensor, + block_indices: torch.LongTensor, + block_counts: Optional[Union[torch.LongTensor, int]] = None, + block_size: int = 64, + window_size: int = 0, + scale: Optional[float] = None, + cu_seqlens: Optional[torch.LongTensor] = None, + head_first: bool = False, +) -> torch.Tensor: r""" Args: q (torch.Tensor): @@ -276,29 +270,27 @@ def parallel_nsa(q: torch.Tensor, Outputs of shape `[B, T, HQ, V]` if `head_first=False` else `[B, HQ, T, V]`. 
""" if scale is None: - scale = k.shape[-1]**-0.5 + scale = k.shape[-1] ** -0.5 if cu_seqlens is not None: assert q.shape[0] == 1, "batch size must be 1 when cu_seqlens are provided" if head_first: - q, k, v, block_indices = map(lambda x: rearrange(x, 'b h t d -> b t h d'), - (q, k, v, block_indices)) - g_slc, g_swa = map(lambda x: rearrange(x, 'b h t -> b t h'), (g_slc, g_swa)) + q, k, v, block_indices = map(lambda x: rearrange(x, "b h t d -> b t h d"), (q, k, v, block_indices)) + g_slc, g_swa = map(lambda x: rearrange(x, "b h t -> b t h"), (g_slc, g_swa)) if isinstance(block_counts, torch.Tensor): - block_counts = rearrange(block_counts, 'b h t -> b t h') + block_counts = rearrange(block_counts, "b h t -> b t h") assert q.shape[2] % (k.shape[2] * 16) == 0, "Group size must be a multiple of 16 in NSA" if isinstance(block_counts, int): block_indices = block_indices[:, :, :, :block_counts] block_counts = None - o_slc = ParallelNSAFunction.apply(q, k, v, block_indices, block_counts, block_size, window_size, - scale, cu_seqlens) + o_slc = ParallelNSAFunction.apply(q, k, v, block_indices, block_counts, block_size, window_size, scale, cu_seqlens) if window_size > 0: assert False, "Window size is not supported yet" else: o = o_slc * g_slc.unsqueeze(-1) if head_first: - o = rearrange(o, 'b t h d -> b h t d') + o = rearrange(o, "b t h d -> b h t d") return o @@ -306,41 +298,57 @@ if __name__ == "__main__": N, C_SEQ_LEN, H, HQ, D, S, block_size, dtype = 2, 64, 1, 16, 64, 1, 32, torch.float16 torch.manual_seed(42) # randomly split the sequence into N segments - offsets = torch.cat([ - torch.tensor([0], dtype=torch.long), - torch.arange(16, C_SEQ_LEN)[torch.randperm(C_SEQ_LEN - 1)[:N - 1]], - torch.tensor([C_SEQ_LEN], dtype=torch.long) - ], 0).cuda().sort()[0] + offsets = ( + torch.cat( + [ + torch.tensor([0], dtype=torch.long), + torch.arange(16, C_SEQ_LEN)[torch.randperm(C_SEQ_LEN - 1)[: N - 1]], + torch.tensor([C_SEQ_LEN], dtype=torch.long), + ], + 0, + ) + .cuda() + .sort()[0] + ) # seq-first required for inputs with variable lengths - perm_q = torch.randperm(C_SEQ_LEN, device='cuda') - perm_k = torch.randperm(C_SEQ_LEN, device='cuda') - perm_v = torch.randperm(C_SEQ_LEN, device='cuda') - q = torch.linspace( - 0, 1, steps=C_SEQ_LEN, dtype=dtype, - device='cuda')[perm_q].view(1, C_SEQ_LEN, 1, 1).expand(1, C_SEQ_LEN, HQ, - D).clone().requires_grad_(True) - k = torch.linspace( - 0, 1, steps=C_SEQ_LEN, dtype=dtype, - device='cuda')[perm_k].view(1, C_SEQ_LEN, 1, 1).expand(1, C_SEQ_LEN, H, - D).clone().requires_grad_(True) - v = torch.linspace( - 0, 1, steps=C_SEQ_LEN, dtype=dtype, - device='cuda')[perm_v].view(1, C_SEQ_LEN, 1, 1).expand(1, C_SEQ_LEN, H, - D).clone().requires_grad_(True) - g_slc = torch.rand((1, C_SEQ_LEN, HQ), dtype=dtype, device='cuda').requires_grad_(True) - g_swa = torch.rand((1, C_SEQ_LEN, HQ), dtype=dtype, device='cuda').requires_grad_(True) - do = torch.randn((1, C_SEQ_LEN, HQ, D), dtype=dtype, device='cuda') + perm_q = torch.randperm(C_SEQ_LEN, device="cuda") + perm_k = torch.randperm(C_SEQ_LEN, device="cuda") + perm_v = torch.randperm(C_SEQ_LEN, device="cuda") + q = ( + torch.linspace(0, 1, steps=C_SEQ_LEN, dtype=dtype, device="cuda")[perm_q] + .view(1, C_SEQ_LEN, 1, 1) + .expand(1, C_SEQ_LEN, HQ, D) + .clone() + .requires_grad_(True) + ) + k = ( + torch.linspace(0, 1, steps=C_SEQ_LEN, dtype=dtype, device="cuda")[perm_k] + .view(1, C_SEQ_LEN, 1, 1) + .expand(1, C_SEQ_LEN, H, D) + .clone() + .requires_grad_(True) + ) + v = ( + torch.linspace(0, 1, steps=C_SEQ_LEN, dtype=dtype, 
device="cuda")[perm_v] + .view(1, C_SEQ_LEN, 1, 1) + .expand(1, C_SEQ_LEN, H, D) + .clone() + .requires_grad_(True) + ) + g_slc = torch.rand((1, C_SEQ_LEN, HQ), dtype=dtype, device="cuda").requires_grad_(True) + g_swa = torch.rand((1, C_SEQ_LEN, HQ), dtype=dtype, device="cuda").requires_grad_(True) + do = torch.randn((1, C_SEQ_LEN, HQ, D), dtype=dtype, device="cuda") token_indices = prepare_token_indices(offsets).tolist() - block_indices = torch.full((1, C_SEQ_LEN, H, S), C_SEQ_LEN, dtype=torch.long, device='cuda') + block_indices = torch.full((1, C_SEQ_LEN, H, S), C_SEQ_LEN, dtype=torch.long, device="cuda") for i in range(C_SEQ_LEN): _, t = token_indices[i] for h in range(H): i_i = torch.randperm(max(1, tilelang.cdiv(t, block_size)))[:S] - block_indices[0, i, h, :len(i_i)] = i_i + block_indices[0, i, h, : len(i_i)] = i_i block_indices = block_indices.sort(-1)[0] - block_counts = torch.randint(1, S + 1, (1, C_SEQ_LEN, H), device='cuda') + block_counts = torch.randint(1, S + 1, (1, C_SEQ_LEN, H), device="cuda") ref = naive_nsa( q=q, @@ -351,7 +359,8 @@ if __name__ == "__main__": block_indices=block_indices, block_counts=block_counts, block_size=block_size, - cu_seqlens=offsets) + cu_seqlens=offsets, + ) tri = parallel_nsa( q=q, @@ -362,7 +371,8 @@ if __name__ == "__main__": block_indices=block_indices, block_counts=block_counts, block_size=block_size, - cu_seqlens=offsets) + cu_seqlens=offsets, + ) print("tri", tri) print("ref", ref) diff --git a/examples/deepseek_nsa/example_triton_nsa_bwd.py b/examples/deepseek_nsa/example_triton_nsa_bwd.py index e912794a458a340b9111550298eeeaac19bddc44..af05bfa701654e3ec2dd53ffb2c0b50c61514801 100644 --- a/examples/deepseek_nsa/example_triton_nsa_bwd.py +++ b/examples/deepseek_nsa/example_triton_nsa_bwd.py @@ -8,6 +8,7 @@ import triton import triton.language as tl import fla + if parse(fla.__version__) < parse("0.2.1"): from fla.ops.common.utils import prepare_token_indices else: @@ -17,21 +18,44 @@ from reference import naive_nsa from einops import rearrange -@triton.heuristics({ - 'USE_OFFSETS': lambda args: args['offsets'] is not None, - 'USE_BLOCK_COUNTS': lambda args: isinstance(args['block_counts'], torch.Tensor), -}) +@triton.heuristics( + { + "USE_OFFSETS": lambda args: args["offsets"] is not None, + "USE_BLOCK_COUNTS": lambda args: isinstance(args["block_counts"], torch.Tensor), + } +) @triton.autotune( configs=[triton.Config({}, num_warps=num_warps) for num_warps in [1]], - key=['BS', 'BK', 'BV'], + key=["BS", "BK", "BV"], ) @triton.jit -def parallel_nsa_fwd_kernel(q, k, v, o_slc, o_swa, lse_slc, lse_swa, scale, block_indices, - block_counts, offsets, token_indices, T, H: tl.constexpr, - HQ: tl.constexpr, G: tl.constexpr, K: tl.constexpr, V: tl.constexpr, - S: tl.constexpr, BS: tl.constexpr, WS: tl.constexpr, BK: tl.constexpr, - BV: tl.constexpr, USE_OFFSETS: tl.constexpr, - USE_BLOCK_COUNTS: tl.constexpr): +def parallel_nsa_fwd_kernel( + q, + k, + v, + o_slc, + o_swa, + lse_slc, + lse_swa, + scale, + block_indices, + block_counts, + offsets, + token_indices, + T, + H: tl.constexpr, + HQ: tl.constexpr, + G: tl.constexpr, + K: tl.constexpr, + V: tl.constexpr, + S: tl.constexpr, + BS: tl.constexpr, + WS: tl.constexpr, + BK: tl.constexpr, + BV: tl.constexpr, + USE_OFFSETS: tl.constexpr, + USE_BLOCK_COUNTS: tl.constexpr, +): i_t, i_v, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2) i_b, i_h = i_bh // H, i_bh % H @@ -46,20 +70,18 @@ def parallel_nsa_fwd_kernel(q, k, v, o_slc, o_swa, lse_slc, lse_swa, scale, bloc # else: NS = S - p_q = 
tl.make_block_ptr(q + (bos + i_t) * HQ * K, (HQ, K), (K, 1), (i_h * G, 0), (G, BK), - (1, 0)) + p_q = tl.make_block_ptr(q + (bos + i_t) * HQ * K, (HQ, K), (K, 1), (i_h * G, 0), (G, BK), (1, 0)) # the Q block is kept in the shared memory throughout the whole kernel # [G, BK] b_q = tl.load(p_q, boundary_check=(0, 1)) b_q = (b_q * scale).to(b_q.dtype) - p_o_slc = tl.make_block_ptr(o_slc + (bos + i_t) * HQ * V, (HQ, V), (V, 1), (i_h * G, i_v * BV), - (G, BV), (1, 0)) + p_o_slc = tl.make_block_ptr(o_slc + (bos + i_t) * HQ * V, (HQ, V), (V, 1), (i_h * G, i_v * BV), (G, BV), (1, 0)) p_lse_slc = lse_slc + (bos + i_t) * HQ + i_h * G + tl.arange(0, G) # [G, BV] b_o_slc = tl.zeros([G, BV], dtype=tl.float32) - b_m_slc = tl.full([G], float('-inf'), dtype=tl.float32) + b_m_slc = tl.full([G], float("-inf"), dtype=tl.float32) b_acc_slc = tl.zeros([G], dtype=tl.float32) for i in range(NS): i_s = tl.load(block_indices + i).to(tl.int32) * BS @@ -72,7 +94,7 @@ def parallel_nsa_fwd_kernel(q, k, v, o_slc, o_swa, lse_slc, lse_swa, scale, bloc b_v_slc = tl.load(p_v_slc, boundary_check=(0, 1)) # [G, BS] b_s_slc = tl.dot(b_q, b_k_slc) - b_s_slc = tl.where((i_t >= (i_s + tl.arange(0, BS)))[None, :], b_s_slc, float('-inf')) + b_s_slc = tl.where((i_t >= (i_s + tl.arange(0, BS)))[None, :], b_s_slc, float("-inf")) # [G] b_m_slc, b_mp_slc = tl.maximum(b_m_slc, tl.max(b_s_slc, 1)), b_m_slc @@ -92,7 +114,6 @@ def parallel_nsa_fwd_kernel(q, k, v, o_slc, o_swa, lse_slc, lse_swa, scale, bloc class ParallelNSAFunction(torch.autograd.Function): - @staticmethod @contiguous @autocast_custom_fwd @@ -105,8 +126,7 @@ class ParallelNSAFunction(torch.autograd.Function): # [[0, 0], [0, 1], [1, 0], [1, 1], [1, 2], [1, 3]] token_indices = prepare_token_indices(offsets) if offsets is not None else None - o, lse = parallel_nsa_fwd( - q=q, k=k, v=v, block_indices=block_indices, block_size=block_size, scale=scale) + o, lse = parallel_nsa_fwd(q=q, k=k, v=v, block_indices=block_indices, block_size=block_size, scale=scale) ctx.save_for_backward(q, k, v, o, lse) ctx.block_indices = block_indices ctx.block_size = block_size @@ -134,7 +154,8 @@ class ParallelNSAFunction(torch.autograd.Function): window_size=ctx.window_size, scale=ctx.scale, offsets=ctx.offsets, - token_indices=ctx.token_indices) + token_indices=ctx.token_indices, + ) return dq.to(q), dk.to(k), dv.to(v), None, None, None, None, None, None, None, None @@ -199,37 +220,56 @@ def parallel_nsa_fwd( return o_slc, lse_slc, o_swa, lse_swa -@triton.heuristics({'USE_OFFSETS': lambda args: args['offsets'] is not None}) +@triton.heuristics({"USE_OFFSETS": lambda args: args["offsets"] is not None}) @triton.autotune( configs=[triton.Config({}, num_warps=num_warps) for num_warps in [1, 2, 4, 8]], - key=['BS', 'BK', 'BV'], + key=["BS", "BK", "BV"], ) -@triton.jit(do_not_specialize=['T']) -def parallel_nsa_bwd_kernel_dkv(q, k, v, lse_slc, lse_swa, delta_slc, delta_swa, do_slc, do_swa, dk, - dv, block_mask, offsets, chunk_indices, scale, T, B: tl.constexpr, - H: tl.constexpr, HQ: tl.constexpr, G: tl.constexpr, K: tl.constexpr, - V: tl.constexpr, M: tl.constexpr, BS: tl.constexpr, - WS: tl.constexpr, BK: tl.constexpr, BV: tl.constexpr, - USE_OFFSETS: tl.constexpr): +@triton.jit(do_not_specialize=["T"]) +def parallel_nsa_bwd_kernel_dkv( + q, + k, + v, + lse_slc, + lse_swa, + delta_slc, + delta_swa, + do_slc, + do_swa, + dk, + dv, + block_mask, + offsets, + chunk_indices, + scale, + T, + B: tl.constexpr, + H: tl.constexpr, + HQ: tl.constexpr, + G: tl.constexpr, + K: tl.constexpr, + V: tl.constexpr, + 
M: tl.constexpr, + BS: tl.constexpr, + WS: tl.constexpr, + BK: tl.constexpr, + BV: tl.constexpr, + USE_OFFSETS: tl.constexpr, +): i_v, i_s, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2) i_b, i_h = i_bh // H, i_bh % H if USE_OFFSETS: - i_n, i_s = tl.load(chunk_indices + i_s * 2).to(tl.int32), tl.load(chunk_indices + i_s * 2 + - 1).to(tl.int32) + i_n, i_s = tl.load(chunk_indices + i_s * 2).to(tl.int32), tl.load(chunk_indices + i_s * 2 + 1).to(tl.int32) bos, eos = tl.load(offsets + i_n).to(tl.int32), tl.load(offsets + i_n + 1).to(tl.int32) T = eos - bos else: bos, eos = i_b * T, i_b * T + T - p_k = tl.make_block_ptr(k + (bos * H + i_h) * K, (T, K), (H * K, 1), (i_s * BS, 0), (BS, BK), - (1, 0)) - p_v = tl.make_block_ptr(v + (bos * H + i_h) * V, (T, V), (H * V, 1), (i_s * BS, i_v * BV), - (BS, BV), (1, 0)) - p_dk = tl.make_block_ptr(dk + (i_v * B * T * H + bos * H + i_h) * K, (T, K), (H * K, 1), - (i_s * BS, 0), (BS, BK), (1, 0)) - p_dv = tl.make_block_ptr(dv + (bos * H + i_h) * V, (T, V), (H * V, 1), (i_s * BS, i_v * BV), - (BS, BV), (1, 0)) + p_k = tl.make_block_ptr(k + (bos * H + i_h) * K, (T, K), (H * K, 1), (i_s * BS, 0), (BS, BK), (1, 0)) + p_v = tl.make_block_ptr(v + (bos * H + i_h) * V, (T, V), (H * V, 1), (i_s * BS, i_v * BV), (BS, BV), (1, 0)) + p_dk = tl.make_block_ptr(dk + (i_v * B * T * H + bos * H + i_h) * K, (T, K), (H * K, 1), (i_s * BS, 0), (BS, BK), (1, 0)) + p_dv = tl.make_block_ptr(dv + (bos * H + i_h) * V, (T, V), (H * V, 1), (i_s * BS, i_v * BV), (BS, BV), (1, 0)) # [BS, BK] b_k = tl.load(p_k, boundary_check=(0, 1)) @@ -241,14 +281,12 @@ def parallel_nsa_bwd_kernel_dkv(q, k, v, lse_slc, lse_swa, delta_slc, delta_swa, for i in range(i_s * BS, T): b_m_slc = tl.load(block_mask + (bos + i) * H * M + i_h * M + i_s) if b_m_slc: - p_q = tl.make_block_ptr(q + (bos + i) * HQ * K, (HQ, K), (K, 1), (i_h * G, 0), (G, BK), - (1, 0)) + p_q = tl.make_block_ptr(q + (bos + i) * HQ * K, (HQ, K), (K, 1), (i_h * G, 0), (G, BK), (1, 0)) # [G, BK] b_q = tl.load(p_q, boundary_check=(0, 1)) b_q = (b_q * scale).to(b_q.dtype) - p_do_slc = tl.make_block_ptr(do_slc + (bos + i) * HQ * V, (HQ, V), (V, 1), - (i_h * G, i_v * BV), (G, BV), (1, 0)) + p_do_slc = tl.make_block_ptr(do_slc + (bos + i) * HQ * V, (HQ, V), (V, 1), (i_h * G, i_v * BV), (G, BV), (1, 0)) p_lse_slc = lse_slc + (bos + i) * HQ + i_h * G + tl.arange(0, G) p_delta_slc = delta_slc + (bos + i) * HQ + i_h * G + tl.arange(0, G) # [G, BV] @@ -272,14 +310,12 @@ def parallel_nsa_bwd_kernel_dkv(q, k, v, lse_slc, lse_swa, delta_slc, delta_swa, if WS > 0: o_s = i_s * BS + tl.arange(0, BS) if max(i_s * BS, i - WS + 1) < min((i_s + 1) * BS, i + 1): - p_q = tl.make_block_ptr(q + (bos + i) * HQ * K, (HQ, K), (K, 1), (i_h * G, 0), - (G, BK), (1, 0)) + p_q = tl.make_block_ptr(q + (bos + i) * HQ * K, (HQ, K), (K, 1), (i_h * G, 0), (G, BK), (1, 0)) # [G, BK] b_q = tl.load(p_q, boundary_check=(0, 1)) b_q = (b_q * scale).to(b_q.dtype) - p_do_swa = tl.make_block_ptr(do_swa + (bos + i) * HQ * V, (HQ, V), (V, 1), - (i_h * G, i_v * BV), (G, BV), (1, 0)) + p_do_swa = tl.make_block_ptr(do_swa + (bos + i) * HQ * V, (HQ, V), (V, 1), (i_h * G, i_v * BV), (G, BV), (1, 0)) p_lse_swa = lse_swa + (bos + i) * HQ + i_h * G + tl.arange(0, G) p_delta_swa = delta_swa + (bos + i) * HQ + i_h * G + tl.arange(0, G) # [G, BV] @@ -304,12 +340,19 @@ def parallel_nsa_bwd_kernel_dkv(q, k, v, lse_slc, lse_swa, delta_slc, delta_swa, tl.store(p_dv, b_dv.to(p_dv.dtype.element_ty), boundary_check=(0, 1)) -@triton.heuristics( - {'USE_BLOCK_COUNTS': lambda args: 
isinstance(args['block_counts'], torch.Tensor)}) +@triton.heuristics({"USE_BLOCK_COUNTS": lambda args: isinstance(args["block_counts"], torch.Tensor)}) @triton.jit -def parallel_nsa_kernel_mask(block_indices, block_counts, block_mask, T: tl.constexpr, - H: tl.constexpr, S: tl.constexpr, BS: tl.constexpr, NS: tl.constexpr, - USE_BLOCK_COUNTS: tl.constexpr): +def parallel_nsa_kernel_mask( + block_indices, + block_counts, + block_mask, + T: tl.constexpr, + H: tl.constexpr, + S: tl.constexpr, + BS: tl.constexpr, + NS: tl.constexpr, + USE_BLOCK_COUNTS: tl.constexpr, +): i_t, i_b, i_hs = tl.program_id(0), tl.program_id(1), tl.program_id(2) i_h, i_s = i_hs // S, i_hs % S @@ -320,31 +363,56 @@ def parallel_nsa_kernel_mask(block_indices, block_counts, block_mask, T: tl.cons b_m = b_i * BS <= i_t if b_i < NS and b_i >= 0: - tl.store(block_mask + i_b * T * H * NS + i_t * H * NS + i_h * NS + b_i, - b_m.to(block_mask.dtype.element_ty)) + tl.store(block_mask + i_b * T * H * NS + i_t * H * NS + i_h * NS + b_i, b_m.to(block_mask.dtype.element_ty)) -@triton.heuristics({ - 'USE_OFFSETS': lambda args: args['offsets'] is not None, - 'USE_BLOCK_COUNTS': lambda args: isinstance(args['block_counts'], torch.Tensor) -}) +@triton.heuristics( + { + "USE_OFFSETS": lambda args: args["offsets"] is not None, + "USE_BLOCK_COUNTS": lambda args: isinstance(args["block_counts"], torch.Tensor), + } +) @triton.autotune( configs=[triton.Config({}, num_warps=num_warps) for num_warps in [1, 2, 4, 8]], - key=['BS', 'BK', 'BV'], + key=["BS", "BK", "BV"], ) -@triton.jit(do_not_specialize=['T']) -def parallel_nsa_bwd_kernel_dq(q, k, v, lse_slc, delta_slc, do_slc, lse_swa, delta_swa, do_swa, dq, - scale, block_indices, block_counts, offsets, token_indices, T, - B: tl.constexpr, H: tl.constexpr, HQ: tl.constexpr, G: tl.constexpr, - K: tl.constexpr, V: tl.constexpr, S: tl.constexpr, BS: tl.constexpr, - WS: tl.constexpr, BK: tl.constexpr, BV: tl.constexpr, - USE_OFFSETS: tl.constexpr, USE_BLOCK_COUNTS: tl.constexpr): +@triton.jit(do_not_specialize=["T"]) +def parallel_nsa_bwd_kernel_dq( + q, + k, + v, + lse_slc, + delta_slc, + do_slc, + lse_swa, + delta_swa, + do_swa, + dq, + scale, + block_indices, + block_counts, + offsets, + token_indices, + T, + B: tl.constexpr, + H: tl.constexpr, + HQ: tl.constexpr, + G: tl.constexpr, + K: tl.constexpr, + V: tl.constexpr, + S: tl.constexpr, + BS: tl.constexpr, + WS: tl.constexpr, + BK: tl.constexpr, + BV: tl.constexpr, + USE_OFFSETS: tl.constexpr, + USE_BLOCK_COUNTS: tl.constexpr, +): i_t, i_v, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2) i_b, i_h = i_bh // H, i_bh % H if USE_OFFSETS: - i_n, i_t = tl.load(token_indices + i_t * 2).to(tl.int32), tl.load(token_indices + i_t * 2 + - 1).to(tl.int32) + i_n, i_t = tl.load(token_indices + i_t * 2).to(tl.int32), tl.load(token_indices + i_t * 2 + 1).to(tl.int32) bos, eos = tl.load(offsets + i_n).to(tl.int32), tl.load(offsets + i_n + 1).to(tl.int32) T = eos - bos else: @@ -449,27 +517,49 @@ def parallel_nsa_bwd_kernel_dq(q, k, v, lse_slc, delta_slc, do_slc, lse_swa, del tl.store(p_dq, (b_dq_slc + b_dq_swa).to(p_dq.dtype.element_ty), boundary_check=(0, 1)) -@triton.heuristics({ - 'USE_OFFSETS': lambda args: args['offsets'] is not None, - 'USE_BLOCK_COUNTS': lambda args: isinstance(args['block_counts'], torch.Tensor), -}) +@triton.heuristics( + { + "USE_OFFSETS": lambda args: args["offsets"] is not None, + "USE_BLOCK_COUNTS": lambda args: isinstance(args["block_counts"], torch.Tensor), + } +) @triton.autotune( configs=[triton.Config({}, 
num_warps=num_warps) for num_warps in [1, 2, 4, 8]], - key=['BS', 'BK', 'BV'], + key=["BS", "BK", "BV"], ) @triton.jit -def parallel_nsa_fwd_kernel(q, k, v, o_slc, o_swa, lse_slc, lse_swa, scale, block_indices, - block_counts, offsets, token_indices, T, H: tl.constexpr, - HQ: tl.constexpr, G: tl.constexpr, K: tl.constexpr, V: tl.constexpr, - S: tl.constexpr, BS: tl.constexpr, WS: tl.constexpr, BK: tl.constexpr, - BV: tl.constexpr, USE_OFFSETS: tl.constexpr, - USE_BLOCK_COUNTS: tl.constexpr): +def parallel_nsa_fwd_kernel( + q, + k, + v, + o_slc, + o_swa, + lse_slc, + lse_swa, + scale, + block_indices, + block_counts, + offsets, + token_indices, + T, + H: tl.constexpr, + HQ: tl.constexpr, + G: tl.constexpr, + K: tl.constexpr, + V: tl.constexpr, + S: tl.constexpr, + BS: tl.constexpr, + WS: tl.constexpr, + BK: tl.constexpr, + BV: tl.constexpr, + USE_OFFSETS: tl.constexpr, + USE_BLOCK_COUNTS: tl.constexpr, +): i_t, i_v, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2) i_b, i_h = i_bh // H, i_bh % H if USE_OFFSETS: - i_n, i_t = tl.load(token_indices + i_t * 2).to(tl.int32), tl.load(token_indices + i_t * 2 + - 1).to(tl.int32) + i_n, i_t = tl.load(token_indices + i_t * 2).to(tl.int32), tl.load(token_indices + i_t * 2 + 1).to(tl.int32) bos, eos = tl.load(offsets + i_n).to(tl.int32), tl.load(offsets + i_n + 1).to(tl.int32) T = eos - bos else: @@ -484,20 +574,18 @@ def parallel_nsa_fwd_kernel(q, k, v, o_slc, o_swa, lse_slc, lse_swa, scale, bloc else: NS = S - p_q = tl.make_block_ptr(q + (bos + i_t) * HQ * K, (HQ, K), (K, 1), (i_h * G, 0), (G, BK), - (1, 0)) + p_q = tl.make_block_ptr(q + (bos + i_t) * HQ * K, (HQ, K), (K, 1), (i_h * G, 0), (G, BK), (1, 0)) # the Q block is kept in the shared memory throughout the whole kernel # [G, BK] b_q = tl.load(p_q, boundary_check=(0, 1)) b_q = (b_q * scale).to(b_q.dtype) - p_o_slc = tl.make_block_ptr(o_slc + (bos + i_t) * HQ * V, (HQ, V), (V, 1), (i_h * G, i_v * BV), - (G, BV), (1, 0)) + p_o_slc = tl.make_block_ptr(o_slc + (bos + i_t) * HQ * V, (HQ, V), (V, 1), (i_h * G, i_v * BV), (G, BV), (1, 0)) p_lse_slc = lse_slc + (bos + i_t) * HQ + i_h * G + tl.arange(0, G) # [G, BV] b_o_slc = tl.zeros([G, BV], dtype=tl.float32) - b_m_slc = tl.full([G], float('-inf'), dtype=tl.float32) + b_m_slc = tl.full([G], float("-inf"), dtype=tl.float32) b_acc_slc = tl.zeros([G], dtype=tl.float32) for i in range(NS): i_s = tl.load(block_indices + i).to(tl.int32) * BS @@ -510,7 +598,7 @@ def parallel_nsa_fwd_kernel(q, k, v, o_slc, o_swa, lse_slc, lse_swa, scale, bloc b_v_slc = tl.load(p_v_slc, boundary_check=(0, 1)) # [G, BS] b_s_slc = tl.dot(b_q, b_k_slc) - b_s_slc = tl.where((i_t >= (i_s + tl.arange(0, BS)))[None, :], b_s_slc, float('-inf')) + b_s_slc = tl.where((i_t >= (i_s + tl.arange(0, BS)))[None, :], b_s_slc, float("-inf")) # [G] b_m_slc, b_mp_slc = tl.maximum(b_m_slc, tl.max(b_s_slc, 1)), b_m_slc @@ -529,13 +617,12 @@ def parallel_nsa_fwd_kernel(q, k, v, o_slc, o_swa, lse_slc, lse_swa, scale, bloc tl.store(p_lse_slc, b_m_slc.to(p_lse_slc.dtype.element_ty)) if WS > 0: - p_o_swa = tl.make_block_ptr(o_swa + (bos + i_t) * HQ * V, (HQ, V), (V, 1), - (i_h * G, i_v * BV), (G, BV), (1, 0)) + p_o_swa = tl.make_block_ptr(o_swa + (bos + i_t) * HQ * V, (HQ, V), (V, 1), (i_h * G, i_v * BV), (G, BV), (1, 0)) p_lse_swa = lse_swa + (bos + i_t) * HQ + i_h * G + tl.arange(0, G) # [G, BV] b_o_swa = tl.zeros([G, BV], dtype=tl.float32) - b_m_swa = tl.full([G], float('-inf'), dtype=tl.float32) + b_m_swa = tl.full([G], float("-inf"), dtype=tl.float32) b_acc_swa = tl.zeros([G], 
dtype=tl.float32) for i_s in range(max(0, i_t - WS + 1), i_t + 1, BS): p_k_swa = tl.make_block_ptr(k, (K, T), (1, H * K), (0, i_s), (BK, BS), (0, 1)) @@ -546,7 +633,7 @@ def parallel_nsa_fwd_kernel(q, k, v, o_slc, o_swa, lse_slc, lse_swa, scale, bloc b_v_swa = tl.load(p_v_swa, boundary_check=(0, 1)) # [G, BS] b_s_swa = tl.dot(b_q, b_k_swa) - b_s_swa = tl.where((i_t >= (i_s + tl.arange(0, BS)))[None, :], b_s_swa, float('-inf')) + b_s_swa = tl.where((i_t >= (i_s + tl.arange(0, BS)))[None, :], b_s_swa, float("-inf")) # [G] b_m_swa, b_mp_swa = tl.maximum(b_m_swa, tl.max(b_s_swa, 1)), b_m_swa @@ -593,14 +680,8 @@ def parallel_nsa_block_mask( block_mask = torch.zeros(B, T, H, NS, dtype=torch.bool, device=block_indices.device) parallel_nsa_kernel_mask[(T, B, H * S)]( - block_indices=block_indices, - block_counts=block_counts, - block_mask=block_mask, - T=T, - H=H, - S=S, - BS=BS, - NS=NS) + block_indices=block_indices, block_counts=block_counts, block_mask=block_mask, T=T, H=H, S=S, BS=BS, NS=NS + ) return block_mask @@ -676,7 +757,8 @@ def parallel_nsa_bwd( BS=BS, WS=WS, BK=BK, - BV=BV) + BV=BV, + ) dq = dq.sum(0) if offsets is not None: @@ -719,14 +801,14 @@ def parallel_nsa_bwd( BS=BS, WS=WS, BK=BK, - BV=BV) + BV=BV, + ) dk = dk.sum(0) return dq, dk, dv @torch.compile class ParallelNSAFunction(torch.autograd.Function): - @staticmethod @contiguous @autocast_custom_fwd @@ -749,7 +831,8 @@ class ParallelNSAFunction(torch.autograd.Function): window_size=window_size, scale=scale, offsets=offsets, - token_indices=token_indices) + token_indices=token_indices, + ) ctx.save_for_backward(q, k, v, o_slc, lse_slc, o_swa, lse_swa) ctx.block_indices = block_indices ctx.block_counts = block_counts @@ -781,22 +864,25 @@ class ParallelNSAFunction(torch.autograd.Function): window_size=ctx.window_size, scale=ctx.scale, offsets=ctx.offsets, - token_indices=ctx.token_indices) + token_indices=ctx.token_indices, + ) return dq.to(q), dk.to(k), dv.to(v), None, None, None, None, None, None, None, None -def parallel_nsa(q: torch.Tensor, - k: torch.Tensor, - v: torch.Tensor, - g_slc: torch.Tensor, - g_swa: torch.Tensor, - block_indices: torch.LongTensor, - block_counts: Optional[Union[torch.LongTensor, int]] = None, - block_size: int = 64, - window_size: int = 0, - scale: Optional[float] = None, - cu_seqlens: Optional[torch.LongTensor] = None, - head_first: bool = False) -> torch.Tensor: +def parallel_nsa( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + g_slc: torch.Tensor, + g_swa: torch.Tensor, + block_indices: torch.LongTensor, + block_counts: Optional[Union[torch.LongTensor, int]] = None, + block_size: int = 64, + window_size: int = 0, + scale: Optional[float] = None, + cu_seqlens: Optional[torch.LongTensor] = None, + head_first: bool = False, +) -> torch.Tensor: r""" Args: q (torch.Tensor): @@ -836,51 +922,49 @@ def parallel_nsa(q: torch.Tensor, Outputs of shape `[B, T, HQ, V]` if `head_first=False` else `[B, HQ, T, V]`. 
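    Example:
        A minimal, illustrative call added for clarity (not part of the original
        docstring). The shapes and dtypes mirror the self-test at the bottom of this
        file; the all-zeros `block_indices` is a placeholder rather than a meaningful
        selection:

            q = torch.randn(2, 64, 16, 32, dtype=torch.float16, device="cuda")
            k = torch.randn(2, 64, 1, 32, dtype=torch.float16, device="cuda")
            v = torch.randn(2, 64, 1, 32, dtype=torch.float16, device="cuda")
            g_slc = torch.ones(2, 64, 16, dtype=torch.float16, device="cuda")
            g_swa = torch.ones(2, 64, 16, dtype=torch.float16, device="cuda")
            # placeholder selection: every query attends to block 0 only
            block_indices = torch.zeros(2, 64, 1, 1, dtype=torch.long, device="cuda")
            o = parallel_nsa(q, k, v, g_slc, g_swa, block_indices, block_size=32)
            # o: [B, T, HQ, V] = [2, 64, 16, 32]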
""" if scale is None: - scale = k.shape[-1]**-0.5 + scale = k.shape[-1] ** -0.5 if cu_seqlens is not None: assert q.shape[0] == 1, "batch size must be 1 when cu_seqlens are provided" if head_first: - q, k, v, block_indices = map(lambda x: rearrange(x, 'b h t d -> b t h d'), - (q, k, v, block_indices)) - g_slc, g_swa = map(lambda x: rearrange(x, 'b h t -> b t h'), (g_slc, g_swa)) + q, k, v, block_indices = map(lambda x: rearrange(x, "b h t d -> b t h d"), (q, k, v, block_indices)) + g_slc, g_swa = map(lambda x: rearrange(x, "b h t -> b t h"), (g_slc, g_swa)) if isinstance(block_counts, torch.Tensor): - block_counts = rearrange(block_counts, 'b h t -> b t h') + block_counts = rearrange(block_counts, "b h t -> b t h") assert q.shape[2] % (k.shape[2] * 16) == 0, "Group size must be a multiple of 16 in NSA" if isinstance(block_counts, int): block_indices = block_indices[:, :, :, :block_counts] block_counts = None - o_slc, o_swa = ParallelNSAFunction.apply(q, k, v, block_indices, block_counts, block_size, - window_size, scale, cu_seqlens) + o_slc, o_swa = ParallelNSAFunction.apply(q, k, v, block_indices, block_counts, block_size, window_size, scale, cu_seqlens) if window_size > 0: o = torch.addcmul(o_slc * g_slc.unsqueeze(-1), o_swa, g_swa.unsqueeze(-1)) else: o = o_slc * g_slc.unsqueeze(-1) if head_first: - o = rearrange(o, 'b t h d -> b h t d') + o = rearrange(o, "b t h d -> b h t d") return o if __name__ == "__main__": B, T, H, HQ, D, S, block_size, dtype = 2, 64, 1, 16, 32, 1, 32, torch.float16 torch.random.manual_seed(0) - q = torch.randn((B, T, HQ, D), dtype=dtype, device='cuda').requires_grad_(True) - k = torch.randn((B, T, H, D), dtype=dtype, device='cuda').requires_grad_(True) - v = torch.randn((B, T, H, D), dtype=dtype, device='cuda').requires_grad_(True) - g_slc = torch.ones((B, T, HQ), dtype=dtype, device='cuda').requires_grad_(True) - g_swa = torch.ones((B, T, HQ), dtype=dtype, device='cuda').requires_grad_(True) - do = torch.randn((B, T, HQ, D), dtype=dtype, device='cuda') - - block_indices = torch.full((B, T, H, S), T, dtype=torch.long, device='cuda') + q = torch.randn((B, T, HQ, D), dtype=dtype, device="cuda").requires_grad_(True) + k = torch.randn((B, T, H, D), dtype=dtype, device="cuda").requires_grad_(True) + v = torch.randn((B, T, H, D), dtype=dtype, device="cuda").requires_grad_(True) + g_slc = torch.ones((B, T, HQ), dtype=dtype, device="cuda").requires_grad_(True) + g_swa = torch.ones((B, T, HQ), dtype=dtype, device="cuda").requires_grad_(True) + do = torch.randn((B, T, HQ, D), dtype=dtype, device="cuda") + + block_indices = torch.full((B, T, H, S), T, dtype=torch.long, device="cuda") for b in range(B): for t in range(T): for h in range(H): i_i = torch.randperm(max(1, (t // block_size)))[:S] - block_indices[b, t, h, :len(i_i)] = i_i + block_indices[b, t, h, : len(i_i)] = i_i block_indices = block_indices.sort(-1)[0] - block_counts = torch.randint(1, S + 1, (B, T, H), device='cuda') + block_counts = torch.randint(1, S + 1, (B, T, H), device="cuda") ref = naive_nsa( q=q, diff --git a/examples/deepseek_nsa/example_triton_nsa_fwd.py b/examples/deepseek_nsa/example_triton_nsa_fwd.py index 2c740013a7ca7a8f5d6ccc3a71357d2d63c1fe6e..c9ab28daaf931ebc7565130343f8e4c15a1570d2 100644 --- a/examples/deepseek_nsa/example_triton_nsa_fwd.py +++ b/examples/deepseek_nsa/example_triton_nsa_fwd.py @@ -8,6 +8,7 @@ import triton import triton.language as tl import fla + if parse(fla.__version__) < parse("0.2.1"): from fla.ops.common.utils import prepare_token_indices else: @@ -17,21 +18,44 @@ 
from reference import naive_nsa from einops import rearrange -@triton.heuristics({ - 'USE_OFFSETS': lambda args: args['offsets'] is not None, - 'USE_BLOCK_COUNTS': lambda args: isinstance(args['block_counts'], torch.Tensor), -}) +@triton.heuristics( + { + "USE_OFFSETS": lambda args: args["offsets"] is not None, + "USE_BLOCK_COUNTS": lambda args: isinstance(args["block_counts"], torch.Tensor), + } +) @triton.autotune( configs=[triton.Config({}, num_warps=num_warps) for num_warps in [1]], - key=['BS', 'BK', 'BV'], + key=["BS", "BK", "BV"], ) @triton.jit -def parallel_nsa_fwd_kernel(q, k, v, o_slc, o_swa, lse_slc, lse_swa, scale, block_indices, - block_counts, offsets, token_indices, T, H: tl.constexpr, - HQ: tl.constexpr, G: tl.constexpr, K: tl.constexpr, V: tl.constexpr, - S: tl.constexpr, BS: tl.constexpr, WS: tl.constexpr, BK: tl.constexpr, - BV: tl.constexpr, USE_OFFSETS: tl.constexpr, - USE_BLOCK_COUNTS: tl.constexpr): +def parallel_nsa_fwd_kernel( + q, + k, + v, + o_slc, + o_swa, + lse_slc, + lse_swa, + scale, + block_indices, + block_counts, + offsets, + token_indices, + T, + H: tl.constexpr, + HQ: tl.constexpr, + G: tl.constexpr, + K: tl.constexpr, + V: tl.constexpr, + S: tl.constexpr, + BS: tl.constexpr, + WS: tl.constexpr, + BK: tl.constexpr, + BV: tl.constexpr, + USE_OFFSETS: tl.constexpr, + USE_BLOCK_COUNTS: tl.constexpr, +): i_t, i_v, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2) i_b, i_h = i_bh // H, i_bh % H @@ -46,20 +70,18 @@ def parallel_nsa_fwd_kernel(q, k, v, o_slc, o_swa, lse_slc, lse_swa, scale, bloc # else: NS = S - p_q = tl.make_block_ptr(q + (bos + i_t) * HQ * K, (HQ, K), (K, 1), (i_h * G, 0), (G, BK), - (1, 0)) + p_q = tl.make_block_ptr(q + (bos + i_t) * HQ * K, (HQ, K), (K, 1), (i_h * G, 0), (G, BK), (1, 0)) # the Q block is kept in the shared memory throughout the whole kernel # [G, BK] b_q = tl.load(p_q, boundary_check=(0, 1)) b_q = (b_q * scale).to(b_q.dtype) - p_o_slc = tl.make_block_ptr(o_slc + (bos + i_t) * HQ * V, (HQ, V), (V, 1), (i_h * G, i_v * BV), - (G, BV), (1, 0)) + p_o_slc = tl.make_block_ptr(o_slc + (bos + i_t) * HQ * V, (HQ, V), (V, 1), (i_h * G, i_v * BV), (G, BV), (1, 0)) p_lse_slc = lse_slc + (bos + i_t) * HQ + i_h * G + tl.arange(0, G) # [G, BV] b_o_slc = tl.zeros([G, BV], dtype=tl.float32) - b_m_slc = tl.full([G], float('-inf'), dtype=tl.float32) + b_m_slc = tl.full([G], float("-inf"), dtype=tl.float32) b_acc_slc = tl.zeros([G], dtype=tl.float32) for i in range(NS): i_s = tl.load(block_indices + i).to(tl.int32) * BS @@ -72,7 +94,7 @@ def parallel_nsa_fwd_kernel(q, k, v, o_slc, o_swa, lse_slc, lse_swa, scale, bloc b_v_slc = tl.load(p_v_slc, boundary_check=(0, 1)) # [G, BS] b_s_slc = tl.dot(b_q, b_k_slc) - b_s_slc = tl.where((i_t >= (i_s + tl.arange(0, BS)))[None, :], b_s_slc, float('-inf')) + b_s_slc = tl.where((i_t >= (i_s + tl.arange(0, BS)))[None, :], b_s_slc, float("-inf")) # [G] b_m_slc, b_mp_slc = tl.maximum(b_m_slc, tl.max(b_s_slc, 1)), b_m_slc @@ -92,7 +114,6 @@ def parallel_nsa_fwd_kernel(q, k, v, o_slc, o_swa, lse_slc, lse_swa, scale, bloc class ParallelNSAFunction(torch.autograd.Function): - @staticmethod @contiguous @autocast_custom_fwd @@ -105,8 +126,7 @@ class ParallelNSAFunction(torch.autograd.Function): # [[0, 0], [0, 1], [1, 0], [1, 1], [1, 2], [1, 3]] token_indices = prepare_token_indices(offsets) if offsets is not None else None - o, lse = parallel_nsa_fwd( - q=q, k=k, v=v, block_indices=block_indices, block_size=block_size, scale=scale) + o, lse = parallel_nsa_fwd(q=q, k=k, v=v, block_indices=block_indices, 
block_size=block_size, scale=scale) ctx.save_for_backward(q, k, v, o, lse) ctx.block_indices = block_indices ctx.block_size = block_size @@ -177,7 +197,6 @@ def parallel_nsa_fwd( @torch.compile class ParallelNSAFunction(torch.autograd.Function): - @staticmethod @contiguous @autocast_custom_fwd @@ -200,7 +219,8 @@ class ParallelNSAFunction(torch.autograd.Function): window_size=window_size, scale=scale, offsets=offsets, - token_indices=token_indices) + token_indices=token_indices, + ) ctx.save_for_backward(q, k, v, o_slc, lse_slc, o_swa, lse_swa) ctx.block_indices = block_indices ctx.block_counts = block_counts @@ -212,18 +232,20 @@ class ParallelNSAFunction(torch.autograd.Function): return o_slc.to(q.dtype), o_swa.to(q.dtype) if o_swa is not None else o_swa -def parallel_nsa(q: torch.Tensor, - k: torch.Tensor, - v: torch.Tensor, - g_slc: torch.Tensor, - g_swa: torch.Tensor, - block_indices: torch.LongTensor, - block_counts: Optional[Union[torch.LongTensor, int]] = None, - block_size: int = 64, - window_size: int = 0, - scale: Optional[float] = None, - cu_seqlens: Optional[torch.LongTensor] = None, - head_first: bool = False) -> torch.Tensor: +def parallel_nsa( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + g_slc: torch.Tensor, + g_swa: torch.Tensor, + block_indices: torch.LongTensor, + block_counts: Optional[Union[torch.LongTensor, int]] = None, + block_size: int = 64, + window_size: int = 0, + scale: Optional[float] = None, + cu_seqlens: Optional[torch.LongTensor] = None, + head_first: bool = False, +) -> torch.Tensor: r""" Args: q (torch.Tensor): @@ -263,51 +285,49 @@ def parallel_nsa(q: torch.Tensor, Outputs of shape `[B, T, HQ, V]` if `head_first=False` else `[B, HQ, T, V]`. """ if scale is None: - scale = k.shape[-1]**-0.5 + scale = k.shape[-1] ** -0.5 if cu_seqlens is not None: assert q.shape[0] == 1, "batch size must be 1 when cu_seqlens are provided" if head_first: - q, k, v, block_indices = map(lambda x: rearrange(x, 'b h t d -> b t h d'), - (q, k, v, block_indices)) - g_slc, g_swa = map(lambda x: rearrange(x, 'b h t -> b t h'), (g_slc, g_swa)) + q, k, v, block_indices = map(lambda x: rearrange(x, "b h t d -> b t h d"), (q, k, v, block_indices)) + g_slc, g_swa = map(lambda x: rearrange(x, "b h t -> b t h"), (g_slc, g_swa)) if isinstance(block_counts, torch.Tensor): - block_counts = rearrange(block_counts, 'b h t -> b t h') + block_counts = rearrange(block_counts, "b h t -> b t h") assert q.shape[2] % (k.shape[2] * 16) == 0, "Group size must be a multiple of 16 in NSA" if isinstance(block_counts, int): block_indices = block_indices[:, :, :, :block_counts] block_counts = None - o_slc, o_swa = ParallelNSAFunction.apply(q, k, v, block_indices, block_counts, block_size, - window_size, scale, cu_seqlens) + o_slc, o_swa = ParallelNSAFunction.apply(q, k, v, block_indices, block_counts, block_size, window_size, scale, cu_seqlens) if window_size > 0: o = torch.addcmul(o_slc * g_slc.unsqueeze(-1), o_swa, g_swa.unsqueeze(-1)) else: o = o_slc * g_slc.unsqueeze(-1) if head_first: - o = rearrange(o, 'b t h d -> b h t d') + o = rearrange(o, "b t h d -> b h t d") return o if __name__ == "__main__": B, T, H, HQ, D, S, block_size, dtype = 2, 64, 1, 16, 32, 1, 32, torch.float16 torch.random.manual_seed(0) - q = torch.randn((B, T, HQ, D), dtype=dtype, device='cuda').requires_grad_(True) - k = torch.randn((B, T, H, D), dtype=dtype, device='cuda').requires_grad_(True) - v = torch.randn((B, T, H, D), dtype=dtype, device='cuda').requires_grad_(True) - g_slc = torch.ones((B, T, HQ), dtype=dtype, 
device='cuda').requires_grad_(True) - g_swa = torch.ones((B, T, HQ), dtype=dtype, device='cuda').requires_grad_(True) - do = torch.randn((B, T, HQ, D), dtype=dtype, device='cuda') - - block_indices = torch.full((B, T, H, S), T, dtype=torch.long, device='cuda') + q = torch.randn((B, T, HQ, D), dtype=dtype, device="cuda").requires_grad_(True) + k = torch.randn((B, T, H, D), dtype=dtype, device="cuda").requires_grad_(True) + v = torch.randn((B, T, H, D), dtype=dtype, device="cuda").requires_grad_(True) + g_slc = torch.ones((B, T, HQ), dtype=dtype, device="cuda").requires_grad_(True) + g_swa = torch.ones((B, T, HQ), dtype=dtype, device="cuda").requires_grad_(True) + do = torch.randn((B, T, HQ, D), dtype=dtype, device="cuda") + + block_indices = torch.full((B, T, H, S), T, dtype=torch.long, device="cuda") for b in range(B): for t in range(T): for h in range(H): i_i = torch.randperm(max(1, (t // block_size)))[:S] - block_indices[b, t, h, :len(i_i)] = i_i + block_indices[b, t, h, : len(i_i)] = i_i block_indices = block_indices.sort(-1)[0] - block_counts = torch.randint(1, S + 1, (B, T, H), device='cuda') + block_counts = torch.randint(1, S + 1, (B, T, H), device="cuda") ref = naive_nsa( q=q, diff --git a/examples/deepseek_nsa/example_triton_nsa_fwd_varlen.py b/examples/deepseek_nsa/example_triton_nsa_fwd_varlen.py index 9ccbff6a4f12cd871201489d8ed75e2168fc350c..cb4eb6d7ba6119a0ebf16700d65b55b1fd1a237b 100644 --- a/examples/deepseek_nsa/example_triton_nsa_fwd_varlen.py +++ b/examples/deepseek_nsa/example_triton_nsa_fwd_varlen.py @@ -8,6 +8,7 @@ import triton import triton.language as tl import fla + if parse(fla.__version__) < parse("0.2.1"): from fla.ops.common.utils import prepare_token_indices else: @@ -17,27 +18,49 @@ from reference import naive_nsa from einops import rearrange -@triton.heuristics({ - 'USE_OFFSETS': lambda args: args['offsets'] is not None, - 'USE_BLOCK_COUNTS': lambda args: isinstance(args['block_counts'], torch.Tensor), -}) +@triton.heuristics( + { + "USE_OFFSETS": lambda args: args["offsets"] is not None, + "USE_BLOCK_COUNTS": lambda args: isinstance(args["block_counts"], torch.Tensor), + } +) @triton.autotune( configs=[triton.Config({}, num_warps=num_warps) for num_warps in [1, 2, 4, 8]], - key=['BS', 'BK', 'BV'], + key=["BS", "BK", "BV"], ) @triton.jit -def parallel_nsa_fwd_kernel(q, k, v, o_slc, o_swa, lse_slc, lse_swa, scale, block_indices, - block_counts, offsets, token_indices, T, H: tl.constexpr, - HQ: tl.constexpr, G: tl.constexpr, K: tl.constexpr, V: tl.constexpr, - S: tl.constexpr, BS: tl.constexpr, WS: tl.constexpr, BK: tl.constexpr, - BV: tl.constexpr, USE_OFFSETS: tl.constexpr, - USE_BLOCK_COUNTS: tl.constexpr): +def parallel_nsa_fwd_kernel( + q, + k, + v, + o_slc, + o_swa, + lse_slc, + lse_swa, + scale, + block_indices, + block_counts, + offsets, + token_indices, + T, + H: tl.constexpr, + HQ: tl.constexpr, + G: tl.constexpr, + K: tl.constexpr, + V: tl.constexpr, + S: tl.constexpr, + BS: tl.constexpr, + WS: tl.constexpr, + BK: tl.constexpr, + BV: tl.constexpr, + USE_OFFSETS: tl.constexpr, + USE_BLOCK_COUNTS: tl.constexpr, +): i_t, i_v, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2) i_b, i_h = i_bh // H, i_bh % H if USE_OFFSETS: - i_n, i_t = tl.load(token_indices + i_t * 2).to(tl.int32), tl.load(token_indices + i_t * 2 + - 1).to(tl.int32) + i_n, i_t = tl.load(token_indices + i_t * 2).to(tl.int32), tl.load(token_indices + i_t * 2 + 1).to(tl.int32) bos, eos = tl.load(offsets + i_n).to(tl.int32), tl.load(offsets + i_n + 1).to(tl.int32) T = eos - bos 
else: @@ -52,20 +75,18 @@ def parallel_nsa_fwd_kernel(q, k, v, o_slc, o_swa, lse_slc, lse_swa, scale, bloc else: NS = S - p_q = tl.make_block_ptr(q + (bos + i_t) * HQ * K, (HQ, K), (K, 1), (i_h * G, 0), (G, BK), - (1, 0)) + p_q = tl.make_block_ptr(q + (bos + i_t) * HQ * K, (HQ, K), (K, 1), (i_h * G, 0), (G, BK), (1, 0)) # the Q block is kept in the shared memory throughout the whole kernel # [G, BK] b_q = tl.load(p_q, boundary_check=(0, 1)) b_q = (b_q * scale).to(b_q.dtype) - p_o_slc = tl.make_block_ptr(o_slc + (bos + i_t) * HQ * V, (HQ, V), (V, 1), (i_h * G, i_v * BV), - (G, BV), (1, 0)) + p_o_slc = tl.make_block_ptr(o_slc + (bos + i_t) * HQ * V, (HQ, V), (V, 1), (i_h * G, i_v * BV), (G, BV), (1, 0)) p_lse_slc = lse_slc + (bos + i_t) * HQ + i_h * G + tl.arange(0, G) # [G, BV] b_o_slc = tl.zeros([G, BV], dtype=tl.float32) - b_m_slc = tl.full([G], float('-inf'), dtype=tl.float32) + b_m_slc = tl.full([G], float("-inf"), dtype=tl.float32) b_acc_slc = tl.zeros([G], dtype=tl.float32) for i in range(NS): i_s = tl.load(block_indices + i).to(tl.int32) * BS @@ -78,7 +99,7 @@ def parallel_nsa_fwd_kernel(q, k, v, o_slc, o_swa, lse_slc, lse_swa, scale, bloc b_v_slc = tl.load(p_v_slc, boundary_check=(0, 1)) # [G, BS] b_s_slc = tl.dot(b_q, b_k_slc) - b_s_slc = tl.where((i_t >= (i_s + tl.arange(0, BS)))[None, :], b_s_slc, float('-inf')) + b_s_slc = tl.where((i_t >= (i_s + tl.arange(0, BS)))[None, :], b_s_slc, float("-inf")) # [G] b_m_slc, b_mp_slc = tl.maximum(b_m_slc, tl.max(b_s_slc, 1)), b_m_slc @@ -97,13 +118,12 @@ def parallel_nsa_fwd_kernel(q, k, v, o_slc, o_swa, lse_slc, lse_swa, scale, bloc tl.store(p_lse_slc, b_m_slc.to(p_lse_slc.dtype.element_ty)) if WS > 0: - p_o_swa = tl.make_block_ptr(o_swa + (bos + i_t) * HQ * V, (HQ, V), (V, 1), - (i_h * G, i_v * BV), (G, BV), (1, 0)) + p_o_swa = tl.make_block_ptr(o_swa + (bos + i_t) * HQ * V, (HQ, V), (V, 1), (i_h * G, i_v * BV), (G, BV), (1, 0)) p_lse_swa = lse_swa + (bos + i_t) * HQ + i_h * G + tl.arange(0, G) # [G, BV] b_o_swa = tl.zeros([G, BV], dtype=tl.float32) - b_m_swa = tl.full([G], float('-inf'), dtype=tl.float32) + b_m_swa = tl.full([G], float("-inf"), dtype=tl.float32) b_acc_swa = tl.zeros([G], dtype=tl.float32) for i_s in range(max(0, i_t - WS + 1), i_t + 1, BS): p_k_swa = tl.make_block_ptr(k, (K, T), (1, H * K), (0, i_s), (BK, BS), (0, 1)) @@ -114,7 +134,7 @@ def parallel_nsa_fwd_kernel(q, k, v, o_slc, o_swa, lse_slc, lse_swa, scale, bloc b_v_swa = tl.load(p_v_swa, boundary_check=(0, 1)) # [G, BS] b_s_swa = tl.dot(b_q, b_k_swa) - b_s_swa = tl.where((i_t >= (i_s + tl.arange(0, BS)))[None, :], b_s_swa, float('-inf')) + b_s_swa = tl.where((i_t >= (i_s + tl.arange(0, BS)))[None, :], b_s_swa, float("-inf")) # [G] b_m_swa, b_mp_swa = tl.maximum(b_m_swa, tl.max(b_s_swa, 1)), b_m_swa @@ -196,7 +216,6 @@ def parallel_nsa_fwd( @torch.compile class ParallelNSAFunction(torch.autograd.Function): - @staticmethod @contiguous @autocast_custom_fwd @@ -219,7 +238,8 @@ class ParallelNSAFunction(torch.autograd.Function): window_size=window_size, scale=scale, offsets=offsets, - token_indices=token_indices) + token_indices=token_indices, + ) ctx.save_for_backward(q, k, v, o_slc, lse_slc, o_swa, lse_swa) ctx.block_indices = block_indices ctx.block_counts = block_counts @@ -231,18 +251,20 @@ class ParallelNSAFunction(torch.autograd.Function): return o_slc.to(q.dtype), o_swa.to(q.dtype) if o_swa is not None else o_swa -def parallel_nsa(q: torch.Tensor, - k: torch.Tensor, - v: torch.Tensor, - g_slc: torch.Tensor, - g_swa: torch.Tensor, - block_indices: 
torch.LongTensor, - block_counts: Optional[Union[torch.LongTensor, int]] = None, - block_size: int = 64, - window_size: int = 0, - scale: Optional[float] = None, - cu_seqlens: Optional[torch.LongTensor] = None, - head_first: bool = False) -> torch.Tensor: +def parallel_nsa( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + g_slc: torch.Tensor, + g_swa: torch.Tensor, + block_indices: torch.LongTensor, + block_counts: Optional[Union[torch.LongTensor, int]] = None, + block_size: int = 64, + window_size: int = 0, + scale: Optional[float] = None, + cu_seqlens: Optional[torch.LongTensor] = None, + head_first: bool = False, +) -> torch.Tensor: r""" Args: q (torch.Tensor): @@ -282,29 +304,27 @@ def parallel_nsa(q: torch.Tensor, Outputs of shape `[B, T, HQ, V]` if `head_first=False` else `[B, HQ, T, V]`. """ if scale is None: - scale = k.shape[-1]**-0.5 + scale = k.shape[-1] ** -0.5 if cu_seqlens is not None: assert q.shape[0] == 1, "batch size must be 1 when cu_seqlens are provided" if head_first: - q, k, v, block_indices = map(lambda x: rearrange(x, 'b h t d -> b t h d'), - (q, k, v, block_indices)) - g_slc, g_swa = map(lambda x: rearrange(x, 'b h t -> b t h'), (g_slc, g_swa)) + q, k, v, block_indices = map(lambda x: rearrange(x, "b h t d -> b t h d"), (q, k, v, block_indices)) + g_slc, g_swa = map(lambda x: rearrange(x, "b h t -> b t h"), (g_slc, g_swa)) if isinstance(block_counts, torch.Tensor): - block_counts = rearrange(block_counts, 'b h t -> b t h') + block_counts = rearrange(block_counts, "b h t -> b t h") assert q.shape[2] % (k.shape[2] * 16) == 0, "Group size must be a multiple of 16 in NSA" if isinstance(block_counts, int): block_indices = block_indices[:, :, :, :block_counts] block_counts = None - o_slc, o_swa = ParallelNSAFunction.apply(q, k, v, block_indices, block_counts, block_size, - window_size, scale, cu_seqlens) + o_slc, o_swa = ParallelNSAFunction.apply(q, k, v, block_indices, block_counts, block_size, window_size, scale, cu_seqlens) if window_size > 0: o = torch.addcmul(o_slc * g_slc.unsqueeze(-1), o_swa, g_swa.unsqueeze(-1)) else: o = o_slc * g_slc.unsqueeze(-1) if head_first: - o = rearrange(o, 'b t h d -> b h t d') + o = rearrange(o, "b t h d -> b h t d") return o @@ -312,38 +332,35 @@ if __name__ == "__main__": N, T, H, HQ, D, S, block_size, dtype = 2, 64, 1, 16, 64, 1, 32, torch.float16 torch.manual_seed(42) # randomly split the sequence into N segments - offsets = torch.cat([ - torch.tensor([0], dtype=torch.long), - torch.arange(16, T)[torch.randperm(T - 1)[:N - 1]], - torch.tensor([T], dtype=torch.long) - ], 0).cuda().sort()[0] + offsets = ( + torch.cat( + [torch.tensor([0], dtype=torch.long), torch.arange(16, T)[torch.randperm(T - 1)[: N - 1]], torch.tensor([T], dtype=torch.long)], + 0, + ) + .cuda() + .sort()[0] + ) # offsets.shape is [N+1] # seq-first required for inputs with variable lengths - perm_q = torch.randperm(T, device='cuda') - perm_k = torch.randperm(T, device='cuda') - perm_v = torch.randperm(T, device='cuda') - q = torch.linspace( - 0, 1, steps=T, dtype=dtype, - device='cuda')[perm_q].view(1, T, 1, 1).expand(1, T, HQ, D).clone().requires_grad_(True) - k = torch.linspace( - 0, 1, steps=T, dtype=dtype, - device='cuda')[perm_k].view(1, T, 1, 1).expand(1, T, H, D).clone().requires_grad_(True) - v = torch.linspace( - 0, 1, steps=T, dtype=dtype, - device='cuda')[perm_v].view(1, T, 1, 1).expand(1, T, H, D).clone().requires_grad_(True) - g_slc = torch.rand((1, T, HQ), dtype=dtype, device='cuda').requires_grad_(True) - g_swa = torch.rand((1, T, HQ), 
dtype=dtype, device='cuda').requires_grad_(True) - do = torch.randn((1, T, HQ, D), dtype=dtype, device='cuda') + perm_q = torch.randperm(T, device="cuda") + perm_k = torch.randperm(T, device="cuda") + perm_v = torch.randperm(T, device="cuda") + q = torch.linspace(0, 1, steps=T, dtype=dtype, device="cuda")[perm_q].view(1, T, 1, 1).expand(1, T, HQ, D).clone().requires_grad_(True) + k = torch.linspace(0, 1, steps=T, dtype=dtype, device="cuda")[perm_k].view(1, T, 1, 1).expand(1, T, H, D).clone().requires_grad_(True) + v = torch.linspace(0, 1, steps=T, dtype=dtype, device="cuda")[perm_v].view(1, T, 1, 1).expand(1, T, H, D).clone().requires_grad_(True) + g_slc = torch.rand((1, T, HQ), dtype=dtype, device="cuda").requires_grad_(True) + g_swa = torch.rand((1, T, HQ), dtype=dtype, device="cuda").requires_grad_(True) + do = torch.randn((1, T, HQ, D), dtype=dtype, device="cuda") token_indices = prepare_token_indices(offsets).tolist() - block_indices = torch.full((1, T, H, S), T, dtype=torch.long, device='cuda') + block_indices = torch.full((1, T, H, S), T, dtype=torch.long, device="cuda") for i in range(T): _, t = token_indices[i] for h in range(H): i_i = torch.randperm(max(1, triton.cdiv(t, block_size)))[:S] - block_indices[0, i, h, :len(i_i)] = i_i + block_indices[0, i, h, : len(i_i)] = i_i block_indices = block_indices.sort(-1)[0] - block_counts = torch.randint(1, S + 1, (1, T, H), device='cuda') + block_counts = torch.randint(1, S + 1, (1, T, H), device="cuda") ref = naive_nsa( q=q, @@ -354,7 +371,8 @@ if __name__ == "__main__": block_indices=block_indices, block_counts=block_counts, block_size=block_size, - cu_seqlens=offsets) + cu_seqlens=offsets, + ) tri = parallel_nsa( q=q, @@ -365,7 +383,8 @@ if __name__ == "__main__": block_indices=block_indices, block_counts=block_counts, block_size=block_size, - cu_seqlens=offsets) + cu_seqlens=offsets, + ) print("tri", tri) print("ref", ref) diff --git a/examples/deepseek_nsa/reference.py b/examples/deepseek_nsa/reference.py index 958d0c19ee6798b234dd644b8169d27636def779..58083108eb30e871fba15b60a9f36bacee9c3949 100644 --- a/examples/deepseek_nsa/reference.py +++ b/examples/deepseek_nsa/reference.py @@ -6,18 +6,20 @@ from typing import Union from einops import rearrange, repeat -def naive_nsa(q: torch.Tensor, - k: torch.Tensor, - v: torch.Tensor, - g_slc: torch.Tensor, - g_swa: torch.Tensor, - block_indices: torch.LongTensor, - block_counts: Optional[Union[torch.LongTensor, int]] = None, - block_size: int = 64, - window_size: int = 0, - scale: Optional[float] = None, - cu_seqlens: Optional[torch.LongTensor] = None, - head_first: bool = False) -> torch.Tensor: +def naive_nsa( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + g_slc: torch.Tensor, + g_swa: torch.Tensor, + block_indices: torch.LongTensor, + block_counts: Optional[Union[torch.LongTensor, int]] = None, + block_size: int = 64, + window_size: int = 0, + scale: Optional[float] = None, + cu_seqlens: Optional[torch.LongTensor] = None, + head_first: bool = False, +) -> torch.Tensor: r""" Args: q (torch.Tensor): @@ -57,26 +59,24 @@ def naive_nsa(q: torch.Tensor, Outputs of shape `[B, T, HQ, V]` if `head_first=False` else `[B, HQ, T, V]`. 
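    Example:
        An illustrative variable-length call added for clarity (not part of the
        original docstring). With `cu_seqlens`, the batch dimension must be 1 and the
        sequences are packed along the time axis, as in the varlen example script in
        this directory; all values below are placeholders:

            q = torch.randn(1, 64, 16, 32, dtype=torch.float16, device="cuda")
            k = torch.randn(1, 64, 1, 32, dtype=torch.float16, device="cuda")
            v = torch.randn(1, 64, 1, 32, dtype=torch.float16, device="cuda")
            g_slc = torch.ones(1, 64, 16, dtype=torch.float16, device="cuda")
            g_swa = torch.ones(1, 64, 16, dtype=torch.float16, device="cuda")
            # placeholder selection and two packed sequences of length 32 each
            block_indices = torch.zeros(1, 64, 1, 1, dtype=torch.long, device="cuda")
            cu_seqlens = torch.tensor([0, 32, 64], device="cuda")
            o = naive_nsa(q, k, v, g_slc, g_swa, block_indices,
                          block_size=32, cu_seqlens=cu_seqlens)
            # o: [1, T, HQ, V] = [1, 64, 16, 32]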
""" if scale is None: - scale = k.shape[-1]**-0.5 + scale = k.shape[-1] ** -0.5 if cu_seqlens is not None: assert q.shape[0] == 1, "batch size must be 1 when cu_seqlens are provided" if head_first: - raise RuntimeError( - "Sequences with variable lengths are not supported for head-first mode") + raise RuntimeError("Sequences with variable lengths are not supported for head-first mode") if head_first: - q, k, v, block_indices = map(lambda x: rearrange(x, 'b h t d -> b t h d'), - (q, k, v, block_indices)) - g_slc, g_swa = map(lambda x: rearrange(x, 'b h t -> b t h'), (g_slc, g_swa)) + q, k, v, block_indices = map(lambda x: rearrange(x, "b h t d -> b t h d"), (q, k, v, block_indices)) + g_slc, g_swa = map(lambda x: rearrange(x, "b h t -> b t h"), (g_slc, g_swa)) if isinstance(block_counts, torch.Tensor): - block_counts = rearrange(block_counts, 'b h t -> b t h') + block_counts = rearrange(block_counts, "b h t -> b t h") dtype = q.dtype G = q.shape[2] // k.shape[2] BS = block_size S = block_indices.shape[-1] - k, v, block_indices = (repeat(x, 'b t h d -> b t (h g) d', g=G) for x in (k, v, block_indices)) + k, v, block_indices = (repeat(x, "b t h d -> b t (h g) d", g=G) for x in (k, v, block_indices)) if isinstance(block_counts, torch.Tensor): - block_counts = repeat(block_counts, 'b t h -> b t (h g)', g=G) + block_counts = repeat(block_counts, "b t h -> b t (h g)", g=G) c = torch.arange(S).repeat_interleave(BS).unsqueeze(1).expand(-1, q.shape[2]).to(q.device) q, k, v = map(lambda x: x.float(), (q, k, v)) @@ -86,14 +86,11 @@ def naive_nsa(q: torch.Tensor, if cu_seqlens is None: varlen = False B, T = q.shape[:2] - cu_seqlens = torch.cat( - [block_indices.new_tensor(range(0, B * T, T)), - block_indices.new_tensor([B * T])]) + cu_seqlens = torch.cat([block_indices.new_tensor(range(0, B * T, T)), block_indices.new_tensor([B * T])]) for i in range(len(cu_seqlens) - 1): if not varlen: - q_b, k_b, v_b, g_slc_b, g_swa_b, i_b = q[i], k[i], v[i], g_slc[i], g_swa[ - i], block_indices[i] + q_b, k_b, v_b, g_slc_b, g_swa_b, i_b = q[i], k[i], v[i], g_slc[i], g_swa[i], block_indices[i] if isinstance(block_counts, torch.Tensor): s_b = block_counts[i] else: @@ -101,10 +98,10 @@ def naive_nsa(q: torch.Tensor, else: T = cu_seqlens[i + 1] - cu_seqlens[i] q_b, k_b, v_b, g_slc_b, g_swa_b, i_b = map( - lambda x: x[0][cu_seqlens[i]:cu_seqlens[i + 1]], - (q, k, v, g_slc, g_swa, block_indices)) + lambda x: x[0][cu_seqlens[i] : cu_seqlens[i + 1]], (q, k, v, g_slc, g_swa, block_indices) + ) if isinstance(block_counts, torch.Tensor): - s_b = block_counts[0][cu_seqlens[i]:cu_seqlens[i + 1]] + s_b = block_counts[0][cu_seqlens[i] : cu_seqlens[i + 1]] else: s_b = block_counts @@ -126,34 +123,28 @@ def naive_nsa(q: torch.Tensor, else: s_i = s_b # [S*BS, HQ, -1] - k_i_slc, v_i_slc = map( - lambda x: x.gather( - 0, - i_i.clamp(0, T - 1).unsqueeze(-1).expand(*i_i.shape, x.shape[-1])), (k_b, v_b)) + k_i_slc, v_i_slc = map(lambda x: x.gather(0, i_i.clamp(0, T - 1).unsqueeze(-1).expand(*i_i.shape, x.shape[-1])), (k_b, v_b)) # [S*BS, HQ] - attn_slc = torch.einsum('h d, n h d -> n h', q_i, k_i_slc).masked_fill( - torch.logical_or(i_i < 0, i_i > i_q) | - (c >= s_i if block_counts is not None else False), float('-inf')).softmax(0) + attn_slc = ( + torch.einsum("h d, n h d -> n h", q_i, k_i_slc) + .masked_fill(torch.logical_or(i_i < 0, i_i > i_q) | (c >= s_i if block_counts is not None else False), float("-inf")) + .softmax(0) + ) if not varlen: - o_slc[i, i_q] = torch.einsum('n h, n h v -> h v', attn_slc, - v_i_slc) * 
g_slc_i.unsqueeze(-1) + o_slc[i, i_q] = torch.einsum("n h, n h v -> h v", attn_slc, v_i_slc) * g_slc_i.unsqueeze(-1) else: - o_slc[0][cu_seqlens[i] + i_q] = torch.einsum('n h, n h v -> h v', attn_slc, - v_i_slc) * g_slc_i.unsqueeze(-1) + o_slc[0][cu_seqlens[i] + i_q] = torch.einsum("n h, n h v -> h v", attn_slc, v_i_slc) * g_slc_i.unsqueeze(-1) if window_size > 0: - k_i_swa, v_i_swa = map(lambda x: x[max(0, i_q - window_size + 1):i_q + 1], - (k_b, v_b)) - attn_swa = torch.einsum('h d, n h d -> n h', q_i, k_i_swa).softmax(0) + k_i_swa, v_i_swa = map(lambda x: x[max(0, i_q - window_size + 1) : i_q + 1], (k_b, v_b)) + attn_swa = torch.einsum("h d, n h d -> n h", q_i, k_i_swa).softmax(0) if not varlen: - o_swa[i, i_q] = torch.einsum('n h, n h v -> h v', attn_swa, - v_i_swa) * g_swa_i.unsqueeze(-1) + o_swa[i, i_q] = torch.einsum("n h, n h v -> h v", attn_swa, v_i_swa) * g_swa_i.unsqueeze(-1) else: - o_swa[0][cu_seqlens[i] + i_q] = torch.einsum('n h, n h v -> h v', attn_swa, - v_i_swa) * g_swa_i.unsqueeze(-1) + o_swa[0][cu_seqlens[i] + i_q] = torch.einsum("n h, n h v -> h v", attn_swa, v_i_swa) * g_swa_i.unsqueeze(-1) if head_first: - o_slc = rearrange(o_slc, 'b t h d -> b h t d') - o_swa = rearrange(o_swa, 'b t h d -> b h t d') + o_slc = rearrange(o_slc, "b t h d -> b h t d") + o_swa = rearrange(o_swa, "b t h d -> b h t d") return o_slc.to(dtype) + o_swa.to(dtype) if o_swa is not None else o_slc.to(dtype) @@ -187,7 +178,7 @@ def naive_nsa_simple( o (torch.Tensor): Outputs of shape `[B, T, HQ, V]` if `head_first=False` else `[B, HQ, T, V]`. """ - scale = k.shape[-1]**-0.5 + scale = k.shape[-1] ** -0.5 dtype = q.dtype HQ = q.shape[2] @@ -197,8 +188,8 @@ def naive_nsa_simple( BS = block_size S = block_indices.shape[-1] SELECTED_BLOCKS_SIZE = S * BS - k, v, block_indices = (repeat(x, 'b t h d -> b t (h g) d', g=G) for x in (k, v, block_indices)) - block_counts = repeat(block_counts, 'b t h -> b t (h g)', g=G) + k, v, block_indices = (repeat(x, "b t h d -> b t (h g) d", g=G) for x in (k, v, block_indices)) + block_counts = repeat(block_counts, "b t h -> b t (h g)", g=G) c = torch.arange(S).repeat_interleave(BS).unsqueeze(1).expand(-1, q.shape[2]).to(q.device) q, k, v = map(lambda x: x.float(), (q, k, v)) o = torch.zeros_like(v) @@ -228,10 +219,10 @@ def naive_nsa_simple( v_i[t, h] = v_b[selected_block_index, h, :] # [S*BS, HQ] - attn = torch.einsum('h d, n h d -> n h', q_i, k_i) - attn = attn.masked_fill((i_i > i_q) | (c >= s_i), float('-inf')) + attn = torch.einsum("h d, n h d -> n h", q_i, k_i) + attn = attn.masked_fill((i_i > i_q) | (c >= s_i), float("-inf")) attn = torch.softmax(attn, dim=0) - o[i, i_q] = torch.einsum('n h, n h v -> h v', attn, v_i) + o[i, i_q] = torch.einsum("n h, n h v -> h v", attn, v_i) return o.to(dtype) @@ -265,7 +256,7 @@ def naive_nsa_simple_inference( o (torch.Tensor): Outputs of shape `[B, 1, HQ, V]` if `head_first=False` else `[B, HQ, T, V]`. 
""" - scale = k.shape[-1]**-0.5 + scale = k.shape[-1] ** -0.5 dtype = q.dtype HQ = q.shape[2] @@ -275,8 +266,8 @@ def naive_nsa_simple_inference( BS = block_size S = block_indices.shape[-1] SELECTED_BLOCKS_SIZE = S * BS - k, v, block_indices = (repeat(x, 'b t h d -> b t (h g) d', g=G) for x in (k, v, block_indices)) - block_counts = repeat(block_counts, 'b t h -> b t (h g)', g=G) + k, v, block_indices = (repeat(x, "b t h d -> b t (h g) d", g=G) for x in (k, v, block_indices)) + block_counts = repeat(block_counts, "b t h -> b t (h g)", g=G) c = torch.arange(S).repeat_interleave(BS).unsqueeze(1).expand(-1, q.shape[2]).to(q.device) q, k, v = map(lambda x: x.float(), (q, k, v)) o = torch.zeros_like(q) @@ -306,9 +297,9 @@ def naive_nsa_simple_inference( v_i[t, h] = v_b[selected_block_index, h, :] # [S*BS, HQ] - attn = torch.einsum('h d, n h d -> n h', q_i, k_i) - attn = attn.masked_fill((c >= s_i), float('-inf')) + attn = torch.einsum("h d, n h d -> n h", q_i, k_i) + attn = attn.masked_fill((c >= s_i), float("-inf")) attn = torch.softmax(attn, dim=0) - o[i, 0] = torch.einsum('n h, n h v -> h v', attn, v_i) + o[i, 0] = torch.einsum("n h, n h v -> h v", attn, v_i) return o.to(dtype) diff --git a/examples/deepseek_v32/fp8_lighting_indexer.py b/examples/deepseek_v32/fp8_lighting_indexer.py index dd940648b66cd0834583861a39a0176be25e5261..01ad0a73469b7b7feff0a58f7918d3d144ec3c19 100644 --- a/examples/deepseek_v32/fp8_lighting_indexer.py +++ b/examples/deepseek_v32/fp8_lighting_indexer.py @@ -28,11 +28,11 @@ def validate_tensor_match(a, b, tolerance=1e-8, tensor_name="tensor", should_rai if should_raise: assert False if not torch.isclose( - a.masked_fill(a_finite, 0), - b.masked_fill(b_finite, 0), - rtol=0, - atol=0, - equal_nan=True, + a.masked_fill(a_finite, 0), + b.masked_fill(b_finite, 0), + rtol=0, + atol=0, + equal_nan=True, ).all(): display_error_message(f"{tensor_name} Error: nonfinite value mismatch") if should_raise: @@ -55,13 +55,10 @@ def get_configs(): threads=[128, 256], block_Q=[1, 2, 4], ) - return [{ - k: v for k, v in zip(iter_params, values) - } for values in itertools.product(*iter_params.values())] + return [{k: v for k, v in zip(iter_params, values)} for values in itertools.product(*iter_params.values())] class SupplyProg: - def __init__(self): self.tensors_dict = {} @@ -88,7 +85,8 @@ supply_prog = SupplyProg() @tilelang.jit( pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - },) + }, +) def mqa_attn_return_logits( heads, index_dim, @@ -99,9 +97,9 @@ def mqa_attn_return_logits( ): if block_Q is None: block_Q = 128 // heads - dtype = "float8_e4m3" - accum_dtype = "float" - index_dtype = "int32" + dtype = T.float8_e4m3fn + accum_dtype = T.float32 + index_dtype = T.int32 seq_len = T.dynamic("seq_len") seq_len_kv = T.dynamic("seq_len_kv") @@ -113,16 +111,15 @@ def mqa_attn_return_logits( @T.prim_func def mqa_attn_return_logits_kernel( - IndexQ: T.Tensor(index_q_shape, dtype), # type: ignore - IndexK: T.Tensor(index_k_shape, dtype), # type: ignore - IndexKScale: T.Tensor(index_k_scale_shape, accum_dtype), # type: ignore - Logits: T.Tensor(logits_shape, accum_dtype), # type: ignore - Weights: T.Tensor([seq_len, heads], accum_dtype), # type: ignore - CuSeqLenKS: T.Tensor([seq_len], index_dtype), # type: ignore - CuSeqLenKE: T.Tensor([seq_len], index_dtype), # type: ignore + IndexQ: T.Tensor(index_q_shape, dtype), # type: ignore + IndexK: T.Tensor(index_k_shape, dtype), # type: ignore + IndexKScale: T.Tensor(index_k_scale_shape, accum_dtype), # type: ignore + Logits: 
T.Tensor(logits_shape, accum_dtype), # type: ignore + Weights: T.Tensor([seq_len, heads], accum_dtype), # type: ignore + CuSeqLenKS: T.Tensor([seq_len], index_dtype), # type: ignore + CuSeqLenKE: T.Tensor([seq_len], index_dtype), # type: ignore ): with T.Kernel(T.ceildiv(seq_len, block_Q), threads=threads) as bx: - index_q_shared = T.alloc_shared([block_Q * heads, index_dim], dtype) index_k_shared = T.alloc_shared([block_N, index_dim], dtype) index_k_scale_fragment = T.alloc_fragment([block_N], accum_dtype) @@ -140,17 +137,14 @@ def mqa_attn_return_logits( cu_k_e_max[0] = -2147483648 for bq_i in T.serial(block_Q): - cu_k_s_min[0] = T.min(cu_k_s_min[0], T.min(CuSeqLenKS[seq_len_i + bq_i], - seq_len_kv)) + cu_k_s_min[0] = T.min(cu_k_s_min[0], T.min(CuSeqLenKS[seq_len_i + bq_i], seq_len_kv)) for bq_i in T.serial(block_Q): - cu_k_e_max[0] = T.max(cu_k_e_max[0], T.min(CuSeqLenKE[seq_len_i + bq_i], - seq_len_kv)) + cu_k_e_max[0] = T.max(cu_k_e_max[0], T.min(CuSeqLenKE[seq_len_i + bq_i], seq_len_kv)) T.copy(IndexQ[seq_len_i * heads, 0], index_q_shared) T.copy(Weights[seq_len_i, 0], weights) - for nbn_i in T.Pipelined( - T.ceildiv(cu_k_e_max[0] - cu_k_s_min[0], block_N), num_stages=num_stages): + for nbn_i in T.Pipelined(T.ceildiv(cu_k_e_max[0] - cu_k_s_min[0], block_N), num_stages=num_stages): T.copy(IndexK[cu_k_s_min[0] + nbn_i * block_N, 0], index_k_shared) T.copy(IndexKScale[cu_k_s_min[0] + nbn_i * block_N], index_k_scale_fragment) @@ -164,15 +158,14 @@ def mqa_attn_return_logits( ) for bn_i, bq_i, h_i in T.Parallel(block_N, block_Q, heads): - s_reshaped[bn_i, bq_i, - h_i] = (T.max(s_reshaped[bn_i, bq_i, h_i], 0) * - weights[bq_i, h_i]) * index_k_scale_fragment[bn_i] + s_reshaped[bn_i, bq_i, h_i] = (T.max(s_reshaped[bn_i, bq_i, h_i], 0) * weights[bq_i, h_i]) * index_k_scale_fragment[ + bn_i + ] T.reduce_sum(s_reshaped, logits, dim=-1, clear=True) for bq_i, bn_i in T.Parallel(block_Q, block_N): - Logits[seq_len_i + bq_i, cu_k_s_min[0] + nbn_i * block_N + bn_i] = ( - logits[bn_i, bq_i]) + Logits[seq_len_i + bq_i, cu_k_s_min[0] + nbn_i * block_N + bn_i] = logits[bn_i, bq_i] return mqa_attn_return_logits_kernel @@ -185,14 +178,14 @@ def clean_logits_( seq_len = T.dynamic("seq_len") seq_len_kv = T.dynamic("seq_len_kv") - dtype = "float" - indices_dtype = "int32" + dtype = T.float + indices_dtype = T.int32 @T.prim_func def clean_logits_kernel( - Logits: T.Tensor([seq_len, seq_len_kv], dtype), # type: ignore - CuSeqLenKS: T.Tensor([seq_len], indices_dtype), # type: ignore - CuSeqLenKE: T.Tensor([seq_len], indices_dtype), # type: ignore + Logits: T.Tensor([seq_len, seq_len_kv], dtype), # type: ignore + CuSeqLenKS: T.Tensor([seq_len], indices_dtype), # type: ignore + CuSeqLenKE: T.Tensor([seq_len], indices_dtype), # type: ignore ): with T.Kernel(seq_len, threads=threads) as bx: tx = T.thread_binding(0, threads, thread="threadIdx.x") @@ -210,13 +203,7 @@ def clean_logits_( return clean_logits_kernel -def mqa_attn_return_logits_interface(q, - kv, - kv_scales, - weights, - cu_seqlen_ks, - cu_seqlen_ke, - clean_logits=True): +def mqa_attn_return_logits_interface(q, kv, kv_scales, weights, cu_seqlen_ks, cu_seqlen_ke, clean_logits=True): seq_len, heads, index_dim = q.shape seq_len_kv = kv.shape[0] @@ -238,20 +225,19 @@ def mqa_attn_return_logits_interface(q, return logits -def ref_fp8_mqa_logits(q: torch.Tensor, kv: torch.Tensor, weights: torch.Tensor, - cu_seqlen_ks: torch.Tensor, cu_seqlen_ke: torch.Tensor): +def ref_fp8_mqa_logits(q: torch.Tensor, kv: torch.Tensor, weights: torch.Tensor, cu_seqlen_ks: 
torch.Tensor, cu_seqlen_ke: torch.Tensor): k = kv q = q.float() k = k.float() seq_len_kv = kv.shape[0] - mask_lo = torch.arange(0, seq_len_kv, device='cuda')[None, :] >= cu_seqlen_ks[:, None] - mask_hi = torch.arange(0, seq_len_kv, device='cuda')[None, :] < cu_seqlen_ke[:, None] + mask_lo = torch.arange(0, seq_len_kv, device="cuda")[None, :] >= cu_seqlen_ks[:, None] + mask_hi = torch.arange(0, seq_len_kv, device="cuda")[None, :] < cu_seqlen_ke[:, None] mask = mask_lo & mask_hi - score = torch.einsum('mhd,nd->hmn', q, k) + score = torch.einsum("mhd,nd->hmn", q, k) logits = (score.relu() * weights.unsqueeze(-1).transpose(0, 1)).sum(dim=0) - logits = logits.masked_fill(~mask, float('-inf')) + logits = logits.masked_fill(~mask, float("-inf")) cost = mask.sum() return logits, cost @@ -265,32 +251,22 @@ def test_fp8_lighting_indexer(S=4096, SKV=8192, H=32, HKV=1, D=64, kv_stride=1): weights = torch.randn(S, H, device="cuda", dtype=torch.float32) p = (torch.randn(S, SKV, device="cuda", dtype=torch.float32) * 4).softmax(dim=-1) - ks, ke = generate_random_cu_seqlens( - per_cp_seqlen=S, cp_size=4, cp_rank=3, kv_stride=kv_stride, average_q_len=2048) + ks, ke = generate_random_cu_seqlens(per_cp_seqlen=S, cp_size=4, cp_rank=3, kv_stride=kv_stride, average_q_len=2048) - logits_ref, cost_ref = ref_fp8_mqa_logits( - q=q, kv=kv, weights=weights, cu_seqlen_ks=ks, cu_seqlen_ke=ke) + logits_ref, cost_ref = ref_fp8_mqa_logits(q=q, kv=kv, weights=weights, cu_seqlen_ks=ks, cu_seqlen_ke=ke) q_fp8 = q.to(torch.float8_e4m3fn) kv_fp8, kv_scales = per_custom_dims_cast_to_fp8(kv, (0,), False) - logits_tl = mqa_attn_return_logits_interface( - q=q_fp8, kv=kv_fp8, kv_scales=kv_scales, weights=weights, cu_seqlen_ks=ks, cu_seqlen_ke=ke) - diff = validate_tensor_match( - logits_ref, logits_tl, tolerance=1e-14, tensor_name="logits", should_raise=False) + logits_tl = mqa_attn_return_logits_interface(q=q_fp8, kv=kv_fp8, kv_scales=kv_scales, weights=weights, cu_seqlen_ks=ks, cu_seqlen_ke=ke) + diff = validate_tensor_match(logits_ref, logits_tl, tolerance=1e-14, tensor_name="logits", should_raise=False) print(f"diff: {diff}") from tilelang.profiler import do_bench def logits_fn(): - return mqa_attn_return_logits_interface( - q=q_fp8, - kv=kv_fp8, - kv_scales=kv_scales, - weights=weights, - cu_seqlen_ks=ks, - cu_seqlen_ke=ke) + return mqa_attn_return_logits_interface(q=q_fp8, kv=kv_fp8, kv_scales=kv_scales, weights=weights, cu_seqlen_ks=ks, cu_seqlen_ke=ke) with torch.profiler.profile(activities=[torch.profiler.ProfilerActivity.CUDA]) as prof: logits_fn() diff --git a/examples/deepseek_v32/inference/kernel.py b/examples/deepseek_v32/inference/kernel.py index 26234353609710bfc7b70edc39ff06e71a531b61..25abf15d597caea97d0a890ca09a9cf73c7aa084 100644 --- a/examples/deepseek_v32/inference/kernel.py +++ b/examples/deepseek_v32/inference/kernel.py @@ -11,21 +11,21 @@ pass_configs = { tilelang.PassConfigKey.TL_DISABLE_FAST_MATH: True, } -FP8 = "float8_e4m3" -BF16 = "bfloat16" -FP32 = "float32" +FP8 = T.float8_e4m3fn +BF16 = T.bfloat16 +FP32 = T.float32 def fast_log2_ceil(x): - bits_x = T.reinterpret("uint32", x) + bits_x = T.reinterpret(T.uint32, x) exp_x = (bits_x >> 23) & 0xFF man_bits = bits_x & ((1 << 23) - 1) - return T.Cast("int32", exp_x - 127 + T.if_then_else(man_bits != 0, 1, 0)) + return T.Cast(T.int32, exp_x - 127 + T.if_then_else(man_bits != 0, 1, 0)) def fast_pow2(x): bits_x = (x + 127) << 23 - return T.reinterpret("float32", bits_x) + return T.reinterpret(T.float32, bits_x) def fast_round_scale(amax, fp8_max_inv): @@ -107,8 
+107,8 @@ def act_quant(x: torch.Tensor, @tilelang.jit(pass_configs=pass_configs) -def fp8_gemm_kernel(N, K, out_dtype=BF16, accum_dtype="float32"): - assert out_dtype in [BF16, "float32"] +def fp8_gemm_kernel(N, K, out_dtype=BF16, accum_dtype=T.float32): + assert out_dtype in [BF16, T.float32] M = T.dynamic("M") group_size = 128 diff --git a/examples/deepseek_v32/sparse_mla_bwd.py b/examples/deepseek_v32/sparse_mla_bwd.py index e7f9c60933eefcf788908d1fca6c8924c98d2cf1..d8035c1ba0e10d4aa7cfadae0d8017ba1333f097 100644 --- a/examples/deepseek_v32/sparse_mla_bwd.py +++ b/examples/deepseek_v32/sparse_mla_bwd.py @@ -13,18 +13,18 @@ def preprocess( D, block_ND=32, num_stages=5, - dtype="bfloat16", - accum_dtype="float", + dtype=T.bfloat16, + accum_dtype=T.float32, ): - assert dtype == "bfloat16" - assert accum_dtype == "float" + assert dtype == T.bfloat16 + assert accum_dtype == T.float32 shape = [B, S, H, D] @T.prim_func def preprocess_kernel( - O: T.Tensor(shape, dtype), - dO: T.Tensor(shape, dtype), - Delta: T.Tensor([B, S, H], accum_dtype), + O: T.Tensor(shape, dtype), + dO: T.Tensor(shape, dtype), + Delta: T.Tensor([B, S, H], accum_dtype), ): with T.Kernel(H, T.ceildiv(S, block_ND), B) as (bx, by, bz): o = T.alloc_fragment([block_ND, block_ND], accum_dtype) @@ -33,16 +33,12 @@ def preprocess( acc = T.alloc_fragment([block_ND, block_ND], accum_dtype) T.clear(acc) for k in T.Pipelined(T.ceildiv(D, block_ND), num_stages=num_stages): - T.copy( - O[bz, by * block_ND:(by + 1) * block_ND, bx, k * block_ND:(k + 1) * block_ND], - o) - T.copy( - dO[bz, by * block_ND:(by + 1) * block_ND, bx, k * block_ND:(k + 1) * block_ND], - do) + T.copy(O[bz, by * block_ND : (by + 1) * block_ND, bx, k * block_ND : (k + 1) * block_ND], o) + T.copy(dO[bz, by * block_ND : (by + 1) * block_ND, bx, k * block_ND : (k + 1) * block_ND], do) for i, j in T.Parallel(block_ND, block_ND): acc[i, j] += o[i, j] * do[i, j] T.reduce_sum(acc, delta, 1) - T.copy(delta, Delta[bz, by * block_ND:(by + 1) * block_ND, bx]) + T.copy(delta, Delta[bz, by * block_ND : (by + 1) * block_ND, bx]) return preprocess_kernel @@ -56,22 +52,22 @@ def postprocess( kv_group=1, block_N=64, threads=128, - dtype="bfloat16", - accum_dtype="float", + dtype=T.bfloat16, + accum_dtype=T.float32, ): - assert dtype == "bfloat16" - assert accum_dtype == "float" + assert dtype == T.bfloat16 + assert accum_dtype == T.float32 dkv_shape = [B, S_kv, kv_group, D + D_tail] @T.prim_func def postprocess_kernel( - dKV: T.Tensor(dkv_shape, accum_dtype), - dKV_out: T.Tensor(dkv_shape, dtype), + dKV: T.Tensor(dkv_shape, accum_dtype), + dKV_out: T.Tensor(dkv_shape, dtype), ): with T.Kernel(T.ceildiv(S_kv, block_N), kv_group, B, threads=threads) as (bx, by, bz): T.copy( - dKV[bz, bx * block_N:(bx + 1) * block_N, by, :], - dKV_out[bz, bx * block_N:(bx + 1) * block_N, by, :], + dKV[bz, bx * block_N : (bx + 1) * block_N, by, :], + dKV_out[bz, bx * block_N : (bx + 1) * block_N, by, :], ) return postprocess_kernel @@ -82,7 +78,9 @@ def postprocess( pass_configs={ tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, - }) + tilelang.PassConfigKey.TL_ENABLE_AGGRESSIVE_SHARED_MEMORY_MERGE: True, + }, +) def bwd( B, S, @@ -97,18 +95,18 @@ def bwd( block_size=32, num_stages=0, threads=256, - indices_dtype="int32", - dtype="bfloat16", - accum_dtype="float", + indices_dtype=T.int32, + dtype=T.bfloat16, + accum_dtype=T.float32, ): - assert is_causal == True, 'non-casual is not supported now' - assert topk % block_size == 0, 'otherwise 
will load some index=0 thus causing wrong kv to be loaded' - assert dtype == "bfloat16" - assert accum_dtype == "float" - assert indices_dtype == "int32" + assert is_causal == True, "non-casual is not supported now" + assert topk % block_size == 0, "otherwise will load some index=0 thus causing wrong kv to be loaded" + assert dtype == T.bfloat16 + assert accum_dtype == T.float32 + assert indices_dtype == T.int32 if sm_scale is None: - sm_scale = (D + D_tail)**(-0.5) + sm_scale = (D + D_tail) ** (-0.5) sm_scale_mul_reciprocal_log2 = sm_scale * 1.44269504 # log2(e) H_kv = H // kv_group @@ -118,9 +116,9 @@ def bwd( indices_shape = [B, S, kv_group, topk] delta_shape = [B, S, H] lse_shape = [B, S, H] - assert indices_dtype == "int32" - assert dtype == "bfloat16" - assert accum_dtype == "float" + assert indices_dtype == T.int32 + assert dtype == T.bfloat16 + assert accum_dtype == T.float32 H = H_kv padded_H = max(tilelang.math.next_power_of_2(H_kv), 16) @@ -131,14 +129,14 @@ def bwd( @T.prim_func def sparse_mla_bwd_kernel( - Q: T.Tensor(q_shape, dtype), - KV: T.Tensor(k_shape, dtype), - dO: T.Tensor(o_shape, dtype), - Indices: T.Tensor(indices_shape, indices_dtype), - Lse: T.Tensor(lse_shape, accum_dtype), - Delta: T.Tensor(delta_shape, accum_dtype), - dQ: T.Tensor(q_shape, dtype), - dKV: T.Tensor(k_shape, accum_dtype), + Q: T.Tensor(q_shape, dtype), + KV: T.Tensor(k_shape, dtype), + dO: T.Tensor(o_shape, dtype), + Indices: T.Tensor(indices_shape, indices_dtype), + Lse: T.Tensor(lse_shape, accum_dtype), + Delta: T.Tensor(delta_shape, accum_dtype), + dQ: T.Tensor(q_shape, dtype), + dKV: T.Tensor(k_shape, accum_dtype), ): with T.Kernel(S, B, kv_group, threads=threads) as (s_i, by, bz): Q_shared = T.alloc_shared([padded_H, D], dtype) @@ -159,23 +157,24 @@ def bwd( acc_dq_tail = T.alloc_fragment([padded_H, D_tail], accum_dtype) acc_dkv = T.alloc_fragment([BS, D], accum_dtype) acc_dkv_tail = T.alloc_fragment([BS, D_tail], accum_dtype) - acc_dkv_shared = T.view(KV_shared, shape=[BS // split_store, D], dtype=accum_dtype) - acc_dkv_tail_shared = T.view( - KV_tail_shared, shape=[BS // split_store, D_tail], dtype=accum_dtype) + acc_dkv_shared = T.alloc_shared([BS // split_store, D], accum_dtype) + acc_dkv_tail_shared = T.alloc_shared([BS // split_store, D_tail], accum_dtype) max_kv_i = s_i - T.copy(Q[by, s_i, bz * padded_H:(bz + 1) * padded_H, :D], Q_shared) - T.copy(Q[by, s_i, bz * padded_H:(bz + 1) * padded_H, D:], Q_tail_shared) - T.copy(dO[by, s_i, bz * padded_H:(bz + 1) * padded_H, :D], dO_shared) + T.copy(Q[by, s_i, bz * padded_H : (bz + 1) * padded_H, :D], Q_shared) + T.copy(Q[by, s_i, bz * padded_H : (bz + 1) * padded_H, D:], Q_tail_shared) + T.copy(dO[by, s_i, bz * padded_H : (bz + 1) * padded_H, :D], dO_shared) T.clear(acc_dq) T.clear(acc_dq_tail) - T.annotate_layout({ - dQ_shared: tilelang.layout.make_swizzled_layout(dQ_shared), - dQ_tail_shared: tilelang.layout.make_swizzled_layout(dQ_tail_shared), - }) + T.annotate_layout( + { + dQ_shared: tilelang.layout.make_swizzled_layout(dQ_shared), + dQ_tail_shared: tilelang.layout.make_swizzled_layout(dQ_tail_shared), + } + ) # Process each block of indices for i_i in T.Pipelined(NS, num_stages=num_stages): @@ -191,62 +190,31 @@ def bwd( for bi_i, d_i in T.Parallel(BS, D): KV_shared[bi_i, d_i] = KV[by, Indices[by, s_i, bz, i_i * BS + bi_i], bz, d_i] - T.gemm( - Q_shared, KV_shared, acc_p, transpose_B=True, policy=T.GemmWarpPolicy.FullCol) + T.gemm(Q_shared, KV_shared, acc_p, transpose_B=True, policy=T.GemmWarpPolicy.FullCol) for bi_i, d_i in 
T.Parallel(BS, D_tail): - KV_tail_shared[bi_i, d_i] = KV[by, Indices[by, s_i, bz, i_i * BS + bi_i], bz, - D + d_i] - T.gemm( - Q_tail_shared, - KV_tail_shared, - acc_p, - transpose_B=True, - policy=T.GemmWarpPolicy.FullCol) + KV_tail_shared[bi_i, d_i] = KV[by, Indices[by, s_i, bz, i_i * BS + bi_i], bz, D + d_i] + T.gemm(Q_tail_shared, KV_tail_shared, acc_p, transpose_B=True, policy=T.GemmWarpPolicy.FullCol) for h_i, bi_i in T.Parallel(padded_H, BS): - acc_p[h_i, bi_i] = T.exp2(acc_p[h_i, bi_i] * sm_scale_mul_reciprocal_log2 - - Lse[by, s_i, bz * padded_H + h_i]) + acc_p[h_i, bi_i] = T.exp2(acc_p[h_i, bi_i] * sm_scale_mul_reciprocal_log2 - Lse[by, s_i, bz * padded_H + h_i]) T.copy(acc_p, P_shared_cast) - T.gemm( - dO_shared, - KV_shared, - acc_dp, - transpose_B=True, - policy=T.GemmWarpPolicy.FullCol, - clear_accum=True) + T.gemm(dO_shared, KV_shared, acc_dp, transpose_B=True, policy=T.GemmWarpPolicy.FullCol, clear_accum=True) for h_i, bi_i in T.Parallel(padded_H, BS): - acc_dp[h_i, bi_i] = acc_p[h_i, bi_i] * ( - acc_dp[h_i, bi_i] - Delta[by, s_i, bz * padded_H + h_i]) * sm_scale + acc_dp[h_i, bi_i] = acc_p[h_i, bi_i] * (acc_dp[h_i, bi_i] - Delta[by, s_i, bz * padded_H + h_i]) * sm_scale T.copy(acc_dp, dP_shared_cast) T.gemm(dP_shared_cast, KV_shared, acc_dq, policy=T.GemmWarpPolicy.FullCol) T.gemm(dP_shared_cast, KV_tail_shared, acc_dq_tail, policy=T.GemmWarpPolicy.FullCol) - T.gemm( - dP_shared_cast, - Q_shared, - acc_dkv, - transpose_A=True, - policy=T.GemmWarpPolicy.FullCol, - clear_accum=True) - T.gemm( - P_shared_cast, - dO_shared, - acc_dkv, - transpose_A=True, - policy=T.GemmWarpPolicy.FullCol) + T.gemm(dP_shared_cast, Q_shared, acc_dkv, transpose_A=True, policy=T.GemmWarpPolicy.FullCol, clear_accum=True) + T.gemm(P_shared_cast, dO_shared, acc_dkv, transpose_A=True, policy=T.GemmWarpPolicy.FullCol) T.clear(acc_dkv_tail) - T.gemm( - dP_shared_cast, - Q_tail_shared, - acc_dkv_tail, - transpose_A=True, - policy=T.GemmWarpPolicy.FullCol) + T.gemm(dP_shared_cast, Q_tail_shared, acc_dkv_tail, transpose_A=True, policy=T.GemmWarpPolicy.FullCol) for s in range(split_store): for bi_i, d_i in T.Parallel(BS, D): @@ -255,41 +223,32 @@ def bwd( for bi_i, d_i in T.Parallel(BS, D_tail): if bi_i < BS // split_store: - acc_dkv_tail_shared[bi_i, - d_i] = acc_dkv_tail[bi_i + s * (BS // split_store), - d_i] + acc_dkv_tail_shared[bi_i, d_i] = acc_dkv_tail[bi_i + s * (BS // split_store), d_i] for bi_i, d_i in T.Parallel(BS // split_store, D // 4): T.atomic_addx4( - dKV[by, Indices[by, s_i, bz, i_i * BS + bi_i + s * (BS // split_store)], - bz, d_i * 4], acc_dkv_shared[bi_i, d_i * 4]) + dKV[by, Indices[by, s_i, bz, i_i * BS + bi_i + s * (BS // split_store)], bz, d_i * 4], + acc_dkv_shared[bi_i, d_i * 4], + ) # Atomically update dKV, dKV_tail tensors for bi_i, d_i in T.Parallel(BS // split_store, D_tail // 4): T.atomic_addx4( - dKV[by, Indices[by, s_i, bz, i_i * BS + bi_i + s * (BS // split_store)], - bz, D + d_i * 4], acc_dkv_tail_shared[bi_i, d_i * 4]) + dKV[by, Indices[by, s_i, bz, i_i * BS + bi_i + s * (BS // split_store)], bz, D + d_i * 4], + acc_dkv_tail_shared[bi_i, d_i * 4], + ) # Store the accumulated dQ T.copy(acc_dq, dQ_shared) T.copy(acc_dq_tail, dQ_tail_shared) - T.copy(dQ_shared, dQ[by, s_i, bz * padded_H:(bz + 1) * padded_H, :D]) - T.copy(dQ_tail_shared, dQ[by, s_i, bz * padded_H:(bz + 1) * padded_H, D:]) + T.copy(dQ_shared, dQ[by, s_i, bz * padded_H : (bz + 1) * padded_H, :D]) + T.copy(dQ_tail_shared, dQ[by, s_i, bz * padded_H : (bz + 1) * padded_H, D:]) return sparse_mla_bwd_kernel -def 
sparse_mla_bwd(q, - kv, - o, - do, - indices, - lse, - sm_scale=None, - is_casual=True, - return_kernel=False, - delta=None): +def sparse_mla_bwd(q, kv, o, do, indices, lse, sm_scale=None, is_casual=True, return_kernel=False, delta=None): assert q.is_contiguous() assert kv.is_contiguous() assert indices.is_contiguous() @@ -322,6 +281,7 @@ def sparse_mla_bwd(q, def ref_sparse_mla_bwd_interface(q, kv, o, do, indices, lse, sm_scale=None, is_casual=True): from sparse_mla_fwd import ref_sparse_mla_fwd_interface + q = q.detach().clone() kv = kv.detach().clone() q.requires_grad = True @@ -331,30 +291,22 @@ def ref_sparse_mla_bwd_interface(q, kv, o, do, indices, lse, sm_scale=None, is_c return q.grad, kv.grad -def test_sparse_mla_bwd(B=1, - S=4096, - SKV=8192, - H=64, - HKV=1, - DQKV=576, - DV=512, - topk=2048, - dtype=torch.bfloat16, - check_correctness=True): +def test_sparse_mla_bwd(B=1, S=4096, SKV=8192, H=64, HKV=1, DQKV=576, DV=512, topk=2048, dtype=torch.bfloat16, check_correctness=True): # Prepare data - q = torch.randn((B, S, H, DQKV), dtype=dtype, device='cuda').requires_grad_(True) - kv = torch.randn((B, SKV, HKV, DQKV), dtype=dtype, device='cuda').requires_grad_(True) - do = torch.randn((B, S, H, DV), dtype=dtype, device='cuda') + q = torch.randn((B, S, H, DQKV), dtype=dtype, device="cuda").requires_grad_(True) + kv = torch.randn((B, SKV, HKV, DQKV), dtype=dtype, device="cuda").requires_grad_(True) + do = torch.randn((B, S, H, DV), dtype=dtype, device="cuda") - indices = torch.full((B, S, HKV, topk), SKV, dtype=torch.int32, device='cuda') + indices = torch.full((B, S, HKV, topk), SKV, dtype=torch.int32, device="cuda") for b in range(B): for t in range(S): for h in range(HKV): i_i = torch.randperm(max(1, t))[:topk] - indices[b, t, h, :len(i_i)] = i_i + indices[b, t, h, : len(i_i)] = i_i # Forward from sparse_mla_fwd import sparse_mla_fwd_interface + tl_out, tl_lse = sparse_mla_fwd_interface(q, kv, indices) tl_dq, tl_dkv = sparse_mla_bwd(q, kv, tl_out, do, indices, tl_lse) @@ -365,13 +317,15 @@ def test_sparse_mla_bwd(B=1, assert_tensors_similar(tl_dkv, ref_dkv, eps=1e-4, name="dkv") print("assert_tensors_similar passed") - per_token_flop = 2 * sum([ - H * DV * topk, - H * DQKV * topk, - H * DQKV * topk, - H * DQKV * topk, - H * DV * topk, - ]) + per_token_flop = 2 * sum( + [ + H * DV * topk, + H * DQKV * topk, + H * DQKV * topk, + H * DQKV * topk, + H * DV * topk, + ] + ) from tilelang.profiler import do_bench def fn(): @@ -379,20 +333,9 @@ def test_sparse_mla_bwd(B=1, ms = do_bench(fn, rep=100, warmup=250) print(f"Average time: {ms:.3f} ms") - print(f'bwd io bandwidth = ', - (B * S * max(DQKV * 2, DQKV + DV) * topk * 2) / (ms * 1e-3) / 1e12) - print(f'bwd tflops = ', per_token_flop * S / (ms * 1e-3) / 1e12) + print(f"bwd io bandwidth = ", (B * S * max(DQKV * 2, DQKV + DV) * topk * 2) / (ms * 1e-3) / 1e12) + print(f"bwd tflops = ", per_token_flop * S / (ms * 1e-3) / 1e12) if __name__ == "__main__": - test_sparse_mla_bwd( - B=1, - S=4096, - SKV=8192, - H=64, - HKV=1, - DQKV=576, - DV=512, - topk=2048, - dtype=torch.bfloat16, - check_correctness=True) + test_sparse_mla_bwd(B=1, S=4096, SKV=8192, H=64, HKV=1, DQKV=576, DV=512, topk=2048, dtype=torch.bfloat16, check_correctness=True) diff --git a/examples/deepseek_v32/sparse_mla_fwd.py b/examples/deepseek_v32/sparse_mla_fwd.py index a39c72c40f3ec0e63ff6bd731356197cf97e2202..f9c4d2f0463da163b05832f600ccc44c66ded58a 100644 --- a/examples/deepseek_v32/sparse_mla_fwd.py +++ b/examples/deepseek_v32/sparse_mla_fwd.py @@ -25,15 +25,12 @@ def 
sparse_mla_fwd( num_stages=2, threads=256, ): - assert dim == tilelang.math.next_power_of_2( - dim), f"haven't check padding correctness yet, dim={dim}" - assert tail_dim == tilelang.math.next_power_of_2( - tail_dim), f"haven't check padding correctness yet, dim={tail_dim}" + assert dim == tilelang.math.next_power_of_2(dim), f"haven't check padding correctness yet, dim={dim}" + assert tail_dim == tilelang.math.next_power_of_2(tail_dim), f"haven't check padding correctness yet, dim={tail_dim}" assert is_causal == True, "non-casual is not supported" - assert (topk % - block_I == 0), "otherwise will load some index=0 thus causing wrong kv to be loaded" + assert topk % block_I == 0, "otherwise will load some index=0 thus causing wrong kv to be loaded" if sm_scale is None: - sm_scale = (1.0 / (dim + tail_dim))**0.5 * 1.44269504 # log2(e) + sm_scale = (1.0 / (dim + tail_dim)) ** 0.5 * 1.44269504 # log2(e) else: sm_scale = sm_scale * 1.44269504 # log2(e) @@ -47,17 +44,17 @@ def sparse_mla_fwd( o_shape = [batch, seq_len, heads, dim] indices_shape = [batch, seq_len, kv_group, topk] lse_shape = [batch, seq_len, heads] - indices_dtype = "int32" - dtype = "bfloat16" - accum_dtype = "float" + indices_dtype = T.int32 + dtype = T.bfloat16 + accum_dtype = T.float32 G = kv_group H = head_kv padded_H = max(tilelang.math.next_power_of_2(head_kv), 16) if padded_H != H: - assert ( - kv_group == 1 - ), "here we solve the H padding automatically, other wise you should handle Q copy and Output copy with your mask (when kv_group == 1, use g_i * padded_H:(g_i+1) * padded_H would be handled automatically)" + assert kv_group == 1, ( + "here we solve the H padding automatically, other wise you should handle Q copy and Output copy with your mask (when kv_group == 1, use g_i * padded_H:(g_i+1) * padded_H would be handled automatically)" + ) BI = block_I NI = tilelang.cdiv(topk, block_I) D = dim @@ -73,18 +70,17 @@ def sparse_mla_fwd( @T.prim_func def main( - Q: T.Tensor(q_shape, dtype), # type: ignore - KV: T.Tensor(kv_shape, dtype), # type: ignore - Indices: T.Tensor(indices_shape, indices_dtype), # type: ignore - Output: T.Tensor(o_shape, dtype), # type: ignore - Lse: T.Tensor(lse_shape, accum_dtype), # type: ignore + Q: T.Tensor(q_shape, dtype), # type: ignore + KV: T.Tensor(kv_shape, dtype), # type: ignore + Indices: T.Tensor(indices_shape, indices_dtype), # type: ignore + Output: T.Tensor(o_shape, dtype), # type: ignore + Lse: T.Tensor(lse_shape, accum_dtype), # type: ignore ): - with T.Kernel( - seq_len * REPLICATE_H, batch, kv_group, threads=threads) as ( - bx, - by, - bz, - ): + with T.Kernel(seq_len * REPLICATE_H, batch, kv_group, threads=threads) as ( + bx, + by, + bz, + ): Q_shared = T.alloc_shared([H_per_block, D], dtype) Q_tail_shared = T.alloc_shared([H_per_block, D_tail], dtype) KV_shared = T.alloc_shared([BI, D], dtype) @@ -118,16 +114,13 @@ def sparse_mla_fwd( T.copy(Q[b_i, s_i, H0:H1, D:], Q_tail_shared) for i_i in T.Pipelined(NI, num_stages=num_stages): - for bi_i in T.Parallel(BI): mask[bi_i] = Indices[b_i, s_i, g_i, i_i * BI + bi_i] <= max_kv_i for bi_i, d_i in T.Parallel(BI, D): - KV_shared[bi_i, d_i] = KV[b_i, Indices[b_i, s_i, g_i, i_i * BI + bi_i], g_i, - d_i] + KV_shared[bi_i, d_i] = KV[b_i, Indices[b_i, s_i, g_i, i_i * BI + bi_i], g_i, d_i] for bi_i, d_i in T.Parallel(BI, D_tail): - K_tail_shared[bi_i, d_i] = KV[b_i, Indices[b_i, s_i, g_i, i_i * BI + bi_i], g_i, - D + d_i] + K_tail_shared[bi_i, d_i] = KV[b_i, Indices[b_i, s_i, g_i, i_i * BI + bi_i], g_i, D + d_i] for h_i, bi_i in 
T.Parallel(H_per_block, BI): acc_s[h_i, bi_i] = T.if_then_else(mask[bi_i], 0, -T.infinity(acc_s.dtype)) @@ -147,6 +140,8 @@ def sparse_mla_fwd( ) T.copy(m_i, m_i_prev) T.reduce_max(acc_s, m_i, dim=1, clear=False) + for h_i in T.Parallel(H_per_block): + m_i[h_i] = T.max(m_i[h_i], m_i_prev[h_i]) for h_i in T.Parallel(H_per_block): alpha[h_i] = T.exp2((m_i_prev[h_i] - m_i[h_i]) * sm_scale) for h_i, bi_i in T.Parallel(H_per_block, BI): @@ -174,15 +169,7 @@ def sparse_mla_fwd( return main -def sparse_mla_fwd_interface(q, - kv, - indices, - sm_scale=None, - return_p_sum: bool = False, - d_v=512, - block_I=64, - num_stages=2, - threads=256): +def sparse_mla_fwd_interface(q, kv, indices, sm_scale=None, return_p_sum: bool = False, d_v=512, block_I=64, num_stages=2, threads=256): is_casual = True assert return_p_sum == False, "This kernel file is for fwd only" assert q.is_contiguous() and kv.is_contiguous() and indices.is_contiguous() @@ -199,16 +186,8 @@ def sparse_mla_fwd_interface(q, assert indices.shape == (batch, seq_len, kv_group, topk) kernel = sparse_mla_fwd( - heads, - dim, - tail_dim, - topk, - kv_group, - sm_scale, - is_casual, - block_I=block_I, - num_stages=num_stages, - threads=threads) + heads, dim, tail_dim, topk, kv_group, sm_scale, is_casual, block_I=block_I, num_stages=num_stages, threads=threads + ) out, lse = kernel(q, kv, indices) return out, lse @@ -228,14 +207,14 @@ def ref_sparse_mla_fwd_interface(q, kv, indices, sm_scale=None, is_casual=True): b, _, _, dim_v = v.shape g_index = g h_index = h // g - compressed_casual_mask = torch.arange( - 0, sq, dtype=torch.int32, device="cuda").view(-1, 1) >= torch.arange( - 1 - 1, sk * 1, 1, dtype=torch.int32, device="cuda").view(1, -1) + compressed_casual_mask = torch.arange(0, sq, dtype=torch.int32, device="cuda").view(-1, 1) >= torch.arange( + 1 - 1, sk * 1, 1, dtype=torch.int32, device="cuda" + ).view(1, -1) mask = q.new_zeros(b, g_index, sq, sk + 1, dtype=torch.bool).scatter(3, indices.long(), 1) mask = mask[..., :-1] mask = mask & compressed_casual_mask.view(1, 1, sq, sk) - mask[:, :, :1 - 1, 0] = True + mask[:, :, : 1 - 1, 0] = True mask = mask.view(b, g_index, 1, sq, sk) q = q.view(b, sq, g, -1, dim_q) @@ -250,19 +229,21 @@ def ref_sparse_mla_fwd_interface(q, kv, indices, sm_scale=None, is_casual=True): return o.to(torch.bfloat16) -def test_sparse_mla_fwd(B=1, - S=4096, - SKV=8192, - H=128, - HKV=1, - DQK=576, - DV=512, - topk=2048, - dtype=torch.bfloat16, - check_correctness=True, - block_I=64, - num_stages=2, - threads=256): +def test_sparse_mla_fwd( + B=1, + S=4096, + SKV=8192, + H=128, + HKV=1, + DQK=576, + DV=512, + topk=2048, + dtype=torch.bfloat16, + check_correctness=True, + block_I=64, + num_stages=2, + threads=256, +): torch.random.manual_seed(0) q = torch.randn((B, S, H, DQK), dtype=dtype, device="cuda").requires_grad_(True) kv = torch.randn((B, SKV, HKV, DQK), dtype=dtype, device="cuda").requires_grad_(True) @@ -272,10 +253,9 @@ def test_sparse_mla_fwd(B=1, for t in range(S): for h in range(HKV): i_i = torch.randperm(max(1, t))[:topk] - indices[b, t, h, :len(i_i)] = i_i + indices[b, t, h, : len(i_i)] = i_i - tl_out, tl_lse = sparse_mla_fwd_interface( - q, kv, indices, block_I=block_I, num_stages=num_stages, threads=threads) + tl_out, tl_lse = sparse_mla_fwd_interface(q, kv, indices, block_I=block_I, num_stages=num_stages, threads=threads) if check_correctness: # otherwise may cause out of memory @@ -284,8 +264,7 @@ def test_sparse_mla_fwd(B=1, print("assert_tensors_similar passed") def fn(): - return 
sparse_mla_fwd_interface( - q, kv, indices, block_I=block_I, num_stages=num_stages, threads=threads) + return sparse_mla_fwd_interface(q, kv, indices, block_I=block_I, num_stages=num_stages, threads=threads) from tilelang.profiler import do_bench @@ -313,4 +292,5 @@ if __name__ == "__main__": check_correctness=True, block_I=64, num_stages=2, - threads=256) + threads=256, + ) diff --git a/examples/deepseek_v32/sparse_mla_fwd_pipelined.py b/examples/deepseek_v32/sparse_mla_fwd_pipelined.py index 96dda7df57544e01e020437883933e221ff3e542..54e1a72090152dbb0d1e325784774f4f42efb5a9 100644 --- a/examples/deepseek_v32/sparse_mla_fwd_pipelined.py +++ b/examples/deepseek_v32/sparse_mla_fwd_pipelined.py @@ -9,10 +9,16 @@ import argparse @tilelang.jit( out_idx=[-2, -1], compile_flags=[ - "-O3", "-Wno-deprecated-declarations", "-U__CUDA_NO_HALF_OPERATORS__", - "-U__CUDA_NO_HALF_CONVERSIONS__", "-U__CUDA_NO_HALF2_OPERATORS__", - "-U__CUDA_NO_BFLOAT16_CONVERSIONS__", "--expt-relaxed-constexpr", "--expt-extended-lambda", - "--ptxas-options=-v,--register-usage-level=10", "-DNDEBUG" + "-O3", + "-Wno-deprecated-declarations", + "-U__CUDA_NO_HALF_OPERATORS__", + "-U__CUDA_NO_HALF_CONVERSIONS__", + "-U__CUDA_NO_HALF2_OPERATORS__", + "-U__CUDA_NO_BFLOAT16_CONVERSIONS__", + "--expt-relaxed-constexpr", + "--expt-extended-lambda", + "--ptxas-options=-v,--register-usage-level=10", + "-DNDEBUG", ], ) def sparse_mla_fwd( @@ -32,14 +38,12 @@ def sparse_mla_fwd( num_stages=0, threads=384, ): - assert dim == tilelang.math.next_power_of_2( - dim), f"haven't check padding correctness yet, dim={dim}" - assert tail_dim == tilelang.math.next_power_of_2( - tail_dim), f"haven't check padding correctness yet, dim={tail_dim}" - assert is_causal == True, 'non-casual is not supported' - assert topk % block_I == 0, 'otherwise will load some index=0 thus causing wrong kv to be loaded' + assert dim == tilelang.math.next_power_of_2(dim), f"haven't check padding correctness yet, dim={dim}" + assert tail_dim == tilelang.math.next_power_of_2(tail_dim), f"haven't check padding correctness yet, dim={tail_dim}" + assert is_causal == True, "non-casual is not supported" + assert topk % block_I == 0, "otherwise will load some index=0 thus causing wrong kv to be loaded" if sm_scale is None: - sm_scale = (1.0 / (dim + tail_dim))**0.5 * 1.44269504 # log2(e) + sm_scale = (1.0 / (dim + tail_dim)) ** 0.5 * 1.44269504 # log2(e) else: sm_scale = sm_scale * 1.44269504 # log2(e) @@ -49,23 +53,25 @@ def sparse_mla_fwd( o_shape = [batch, seq_len, heads, dim] indices_shape = [batch, seq_len, kv_group, topk] lse_shape = [batch, seq_len, heads] - indices_dtype = "int32" - dtype = "bfloat16" - accum_dtype = "float" + indices_dtype = T.int32 + dtype = T.bfloat16 + accum_dtype = T.float32 G = kv_group H = head_kv padded_H = max(tilelang.math.next_power_of_2(head_kv), 16) if padded_H != H: - assert kv_group == 1, 'here we solve the H padding automatically, other wise you should handle Q copy and Output copy with your mask (when kv_group == 1, use g_i * padded_H:(g_i+1) * padded_H would be handled automatically)' + assert kv_group == 1, ( + "here we solve the H padding automatically, other wise you should handle Q copy and Output copy with your mask (when kv_group == 1, use g_i * padded_H:(g_i+1) * padded_H would be handled automatically)" + ) BI = block_I NI = tilelang.cdiv(topk, block_I) - assert NI % 2 == 0, 'NI should be a multiple of 2' + assert NI % 2 == 0, "NI should be a multiple of 2" D = dim D_tail = tail_dim KV_stride = kv_stride if head_kv > 64: - 
assert head_kv % 64 == 0, 'head_kv should be a multiple of 64' + assert head_kv % 64 == 0, "head_kv should be a multiple of 64" REPLICATE_H = head_kv // 64 else: REPLICATE_H = 1 @@ -74,18 +80,14 @@ def sparse_mla_fwd( @T.prim_func def main( - Q: T.Tensor(q_shape, dtype), # type: ignore - KV: T.Tensor(kv_shape, dtype), # type: ignore - Indices: T.Tensor(indices_shape, indices_dtype), # type: ignore - q_start_index_s: T.Tensor(1, indices_dtype), - Output: T.Tensor(o_shape, dtype), # type: ignore - Lse: T.Tensor(lse_shape, accum_dtype), # type: ignore + Q: T.Tensor(q_shape, dtype), # type: ignore + KV: T.Tensor(kv_shape, dtype), # type: ignore + Indices: T.Tensor(indices_shape, indices_dtype), # type: ignore + q_start_index_s: T.Tensor(1, indices_dtype), + Output: T.Tensor(o_shape, dtype), # type: ignore + Lse: T.Tensor(lse_shape, accum_dtype), # type: ignore ): - with T.Kernel( - (seq_len - kv_stride + 1 if CP0 else seq_len) * REPLICATE_H, - batch, - kv_group, - threads=threads) as (bx, by, bz): + with T.Kernel((seq_len - kv_stride + 1 if CP0 else seq_len) * REPLICATE_H, batch, kv_group, threads=threads) as (bx, by, bz): Q_shared_l = T.alloc_shared([H_per_block, D // 2], dtype) Q_shared_r = T.alloc_shared([H_per_block, D // 2], dtype) Q_tail_shared = T.alloc_shared([H_per_block, D_tail], dtype) @@ -122,8 +124,7 @@ def sparse_mla_fwd( bar_sScale_and_sS_free = T.alloc_barrier(arrive_count=256) b_i, g_i = by, bz - s_i = (bx + (KV_stride - 1 if CP0 else 0)) if REPLICATE_H == 1 else ( - bx // REPLICATE_H + (KV_stride - 1 if CP0 else 0)) + s_i = (bx + (KV_stride - 1 if CP0 else 0)) if REPLICATE_H == 1 else (bx // REPLICATE_H + (KV_stride - 1 if CP0 else 0)) q_i = q_start_index_s[0] + s_i max_kv_i = (q_i + 1 - KV_stride) // KV_stride @@ -132,26 +133,24 @@ def sparse_mla_fwd( tx = T.get_thread_binding() - T.copy(Q[b_i, s_i, H0:H1, 0:D // 2], Q_shared_l) - T.copy(Q[b_i, s_i, H0:H1, D // 2:D], Q_shared_r) + T.copy(Q[b_i, s_i, H0:H1, 0 : D // 2], Q_shared_l) + T.copy(Q[b_i, s_i, H0:H1, D // 2 : D], Q_shared_r) T.copy(Q[b_i, s_i, H0:H1, D:], Q_tail_shared) T.barrier_arrive(bar_q) if tx < 128: T.set_max_nreg(240, 1) T.fill(sumexp, 0) - T.fill(m_i, -2**30) # avoid -inf - inf to cause nan + T.fill(m_i, -(2**30)) # avoid -inf - inf to cause nan T.fill(acc_o_l, 0) T.barrier_wait(bar_q, 0) for i_i in T.serial(T.ceildiv(NI, 2)): - # Buffer 0 T.barrier_wait(bar_k_0_ready[0], (i_i & 1)) for h_i, bi_i in T.Parallel(H_per_block, BI): - acc_s[h_i, bi_i] = T.if_then_else(is_kv_valid[bi_i], 0, - -T.infinity(acc_s.dtype)) + acc_s[h_i, bi_i] = T.if_then_else(is_kv_valid[bi_i], 0, -T.infinity(acc_s.dtype)) T.gemm(Q_shared_l, KV_shared_0_l, acc_s, transpose_B=True, wg_wait=-1) T.gemm(Q_shared_r, KV_shared_0_r, acc_s, transpose_B=True, wg_wait=-1) T.gemm(Q_tail_shared, K_tail_shared_0, acc_s, transpose_B=True, wg_wait=-1) @@ -164,6 +163,8 @@ def sparse_mla_fwd( T.copy(m_i, m_i_prev) T.reduce_max(acc_s, m_i, dim=1, clear=False) + for h_i in T.Parallel(H_per_block): + m_i[h_i] = T.max(m_i[h_i], m_i_prev[h_i]) for h_i in T.Parallel(H_per_block): alpha_local[h_i] = T.exp2((m_i_prev[h_i] - m_i[h_i]) * sm_scale) for h_i, bi_i in T.Parallel(H_per_block, BI): @@ -185,8 +186,7 @@ def sparse_mla_fwd( T.barrier_wait(bar_k_1_ready[0], (i_i & 1)) for h_i, bi_i in T.Parallel(H_per_block, BI): - acc_s[h_i, bi_i] = T.if_then_else(is_kv_valid[bi_i], 0, - -T.infinity(acc_s.dtype)) + acc_s[h_i, bi_i] = T.if_then_else(is_kv_valid[bi_i], 0, -T.infinity(acc_s.dtype)) T.gemm(Q_shared_l, KV_shared_1_l, acc_s, transpose_B=True, wg_wait=-1) 
T.gemm(Q_shared_r, KV_shared_1_r, acc_s, transpose_B=True, wg_wait=-1) T.gemm(Q_tail_shared, K_tail_shared_1, acc_s, transpose_B=True, wg_wait=-1) @@ -198,6 +198,8 @@ def sparse_mla_fwd( T.copy(m_i, m_i_prev) T.reduce_max(acc_s, m_i, dim=1, clear=False) + for h_i in T.Parallel(H_per_block): + m_i[h_i] = T.max(m_i[h_i], m_i_prev[h_i]) for h_i in T.Parallel(H_per_block): alpha_local[h_i] = T.exp2((m_i_prev[h_i] - m_i[h_i]) * sm_scale) for h_i, bi_i in T.Parallel(H_per_block, BI): @@ -223,7 +225,7 @@ def sparse_mla_fwd( for h_i in T.Parallel(H_per_block): sumexp[h_i] = T.log2(sumexp[h_i]) + m_i[h_i] * sm_scale T.copy(acc_o_l, O_shared_l) - T.copy(O_shared_l, Output[b_i, s_i, H0:H1, 0:D // 2]) + T.copy(O_shared_l, Output[b_i, s_i, H0:H1, 0 : D // 2]) elif tx >= 128 and tx < 256: T.set_max_nreg(168, 1) @@ -253,7 +255,7 @@ def sparse_mla_fwd( acc_o_r[h_i, d_i] /= sum_exp_shared[h_i] T.copy(acc_o_r, O_shared_r) - T.copy(O_shared_r, Output[b_i, s_i, H0:H1, D // 2:D]) + T.copy(O_shared_r, Output[b_i, s_i, H0:H1, D // 2 : D]) elif tx >= 256: # producer T.set_max_nreg(80, 0) @@ -261,70 +263,58 @@ def sparse_mla_fwd( # Buffer 0 T.barrier_wait(bar_k_0_free[0], ((i_i & 1) ^ 1)) for r in T.serial(4): - indices_local[0] = Indices[b_i, s_i, g_i, - (i_i * 2) * BI + r * 16 + (tx - 256) // 8] + indices_local[0] = Indices[b_i, s_i, g_i, (i_i * 2) * BI + r * 16 + (tx - 256) // 8] is_kv_valid[r * 16 + (tx - 256) // 8] = indices_local[0] <= max_kv_i if is_kv_valid[r * 16 + (tx - 256) // 8]: with T.attr("default", "async_scope", 1): for u in T.serial(4): for v in T.vectorized(8): - KV_shared_0_l[r * 16 + (tx - 256) // 8, - 64 * u + (tx - 256) % 8 * 8 + - v] = KV[b_i, indices_local[0], g_i, - 64 * u + (tx - 256) % 8 * 8 + v] - KV_shared_0_r[r * 16 + (tx - 256) // 8, - 64 * u + (tx - 256) % 8 * 8 + - v] = KV[b_i, indices_local[0], g_i, D // 2 + - 64 * u + (tx - 256) % 8 * 8 + v] + KV_shared_0_l[r * 16 + (tx - 256) // 8, 64 * u + (tx - 256) % 8 * 8 + v] = KV[ + b_i, indices_local[0], g_i, 64 * u + (tx - 256) % 8 * 8 + v + ] + KV_shared_0_r[r * 16 + (tx - 256) // 8, 64 * u + (tx - 256) % 8 * 8 + v] = KV[ + b_i, indices_local[0], g_i, D // 2 + 64 * u + (tx - 256) % 8 * 8 + v + ] with T.attr("default", "async_scope", 1): for v in T.vectorized(8): - K_tail_shared_0[r * 16 + (tx - 256) // 8, (tx - 256) % 8 * 8 + - v] = KV[b_i, indices_local[0], g_i, - D + (tx - 256) % 8 * 8 + v] + K_tail_shared_0[r * 16 + (tx - 256) // 8, (tx - 256) % 8 * 8 + v] = KV[ + b_i, indices_local[0], g_i, D + (tx - 256) % 8 * 8 + v + ] T.cp_async_barrier_noinc(bar_k_0_ready[0]) # Buffer 1 T.barrier_wait(bar_k_1_free[0], ((i_i & 1) ^ 1)) for r in T.serial(4): - indices_local[0] = Indices[b_i, s_i, g_i, - (i_i * 2 + 1) * BI + r * 16 + (tx - 256) // 8] + indices_local[0] = Indices[b_i, s_i, g_i, (i_i * 2 + 1) * BI + r * 16 + (tx - 256) // 8] is_kv_valid[r * 16 + (tx - 256) // 8] = indices_local[0] <= max_kv_i if is_kv_valid[r * 16 + (tx - 256) // 8]: with T.attr("default", "async_scope", 1): for u in T.serial(4): for v in T.vectorized(8): - KV_shared_1_l[r * 16 + (tx - 256) // 8, - 64 * u + (tx - 256) % 8 * 8 + - v] = KV[b_i, indices_local[0], g_i, - 64 * u + (tx - 256) % 8 * 8 + v] - KV_shared_1_r[r * 16 + (tx - 256) // 8, - 64 * u + (tx - 256) % 8 * 8 + - v] = KV[b_i, indices_local[0], g_i, D // 2 + - 64 * u + (tx - 256) % 8 * 8 + v] + KV_shared_1_l[r * 16 + (tx - 256) // 8, 64 * u + (tx - 256) % 8 * 8 + v] = KV[ + b_i, indices_local[0], g_i, 64 * u + (tx - 256) % 8 * 8 + v + ] + KV_shared_1_r[r * 16 + (tx - 256) // 8, 64 * u + (tx - 256) % 8 * 8 
+ v] = KV[ + b_i, indices_local[0], g_i, D // 2 + 64 * u + (tx - 256) % 8 * 8 + v + ] with T.attr("default", "async_scope", 1): for v in T.vectorized(8): - K_tail_shared_1[r * 16 + (tx - 256) // 8, (tx - 256) % 8 * 8 + - v] = KV[b_i, indices_local[0], g_i, - D + (tx - 256) % 8 * 8 + v] + K_tail_shared_1[r * 16 + (tx - 256) // 8, (tx - 256) % 8 * 8 + v] = KV[ + b_i, indices_local[0], g_i, D + (tx - 256) % 8 * 8 + v + ] T.cp_async_barrier_noinc(bar_k_1_ready[0]) return main -def sparse_mla_fwd_interface(q, - kv, - indices, - q_start_index_s, - kv_stride, - sm_scale=None, - is_casual=True, - return_kernel=False, - print_kernel=False): +def sparse_mla_fwd_interface( + q, kv, indices, q_start_index_s, kv_stride, sm_scale=None, is_casual=True, return_kernel=False, print_kernel=False +): assert q.is_contiguous() and kv.is_contiguous() and indices.is_contiguous() batch, seq_len, heads, dim_plus_tail_dim = q.shape _, seq_len_kv, kv_group, _ = kv.shape - assert dim_plus_tail_dim == 576, 'you should assign dim otherwise' + assert dim_plus_tail_dim == 576, "you should assign dim otherwise" dim = 512 assert kv.shape[-1] == dim_plus_tail_dim @@ -334,29 +324,23 @@ def sparse_mla_fwd_interface(q, assert indices.shape == (batch, seq_len, kv_group, topk) if q_start_index_s != 0: - assert q_start_index_s > kv_stride, "If it is because each cp has too short length, you should fix the logic involving CP0 (cp_rank == 0), to make sure q with pos < KV_Stride - 1 is masked (or you may just ignore how this is handled if nan in these q's Out would not effect others, which is reported to be likely to happen by wangding)" + assert q_start_index_s > kv_stride, ( + "If it is because each cp has too short length, you should fix the logic involving CP0 (cp_rank == 0), to make sure q with pos < KV_Stride - 1 is masked (or you may just ignore how this is handled if nan in these q's Out would not effect others, which is reported to be likely to happen by wangding)" + ) CP0 = q_start_index_s == 0 - kernel = sparse_mla_fwd(batch, seq_len, seq_len_kv, heads, dim, tail_dim, topk, kv_stride, - kv_group, sm_scale, is_casual, CP0) + kernel = sparse_mla_fwd(batch, seq_len, seq_len_kv, heads, dim, tail_dim, topk, kv_stride, kv_group, sm_scale, is_casual, CP0) if print_kernel: print(kernel.get_kernel_source()) - out, lse = kernel(q, kv, indices, - torch.tensor([q_start_index_s], dtype=torch.int32, device="cuda")) + out, lse = kernel(q, kv, indices, torch.tensor([q_start_index_s], dtype=torch.int32, device="cuda")) if return_kernel: return kernel if q_start_index_s == 0 and kv_stride > 1: - out[:, :kv_stride - 1, :, :] = 0 + out[:, : kv_stride - 1, :, :] = 0 return out, lse -def ref_sparse_mla_fwd_interface(q, - kv, - indices, - q_start_index_s, - kv_stride=4, - sm_scale=None, - is_casual=True): +def ref_sparse_mla_fwd_interface(q, kv, indices, q_start_index_s, kv_stride=4, sm_scale=None, is_casual=True): q = q.float() kv = kv.float() indices = indices.transpose(1, 2) @@ -365,7 +349,7 @@ def ref_sparse_mla_fwd_interface(q, if q_start_index_s is None: q_start_index_s = sk * kv_stride - sq - assert kv.shape[-1] == 576, 'you should assign dim otherwise' + assert kv.shape[-1] == 576, "you should assign dim otherwise" dim = 512 k = kv v = kv[..., :dim] @@ -374,15 +358,14 @@ def ref_sparse_mla_fwd_interface(q, num_kv_per_index = 1 g_index = g h_index = h // g - compressed_casual_mask = torch.arange( - q_start_index_s, sq + q_start_index_s, dtype=torch.int32, - device="cuda").view(-1, 1) >= torch.arange( - kv_stride - 1, sk * kv_stride, 
kv_stride, dtype=torch.int32, device="cuda").view(1, -1) + compressed_casual_mask = torch.arange(q_start_index_s, sq + q_start_index_s, dtype=torch.int32, device="cuda").view( + -1, 1 + ) >= torch.arange(kv_stride - 1, sk * kv_stride, kv_stride, dtype=torch.int32, device="cuda").view(1, -1) mask = q.new_zeros(b, g_index, sq, sk + 1, dtype=torch.bool).scatter(3, indices.long(), 1) mask = mask[..., :-1] mask = mask & compressed_casual_mask.view(1, 1, sq, sk) - mask[:, :, :kv_stride - 1, 0] = True + mask[:, :, : kv_stride - 1, 0] = True mask = mask.view(b, g_index, 1, sq, sk) q = q.view(b, sq, g, -1, dim_q) @@ -397,41 +380,32 @@ def ref_sparse_mla_fwd_interface(q, return o.to(torch.bfloat16) -def test_sparse_mla_fwd_pipelined(B=1, - S=4096, - SKV=8192, - H=128, - HKV=1, - DQK=576, - DV=512, - topk=2048, - dtype=torch.bfloat16, - q_start_s_index=1024, - check_correctness=True): +def test_sparse_mla_fwd_pipelined( + B=1, S=4096, SKV=8192, H=128, HKV=1, DQK=576, DV=512, topk=2048, dtype=torch.bfloat16, q_start_s_index=1024, check_correctness=True +): KV_stride = 1 torch.random.manual_seed(0) - q = torch.randn((B, S, H, DQK), dtype=dtype, device='cuda').requires_grad_(True) / 10 - kv = torch.randn((B, SKV, HKV, DQK), dtype=dtype, device='cuda').requires_grad_(True) / 10 + q = torch.randn((B, S, H, DQK), dtype=dtype, device="cuda").requires_grad_(True) / 10 + kv = torch.randn((B, SKV, HKV, DQK), dtype=dtype, device="cuda").requires_grad_(True) / 10 q_start_s_index_t = torch.tensor([q_start_s_index], dtype=torch.int32, device="cuda") q.clamp_(-10, 10) kv.clamp_(-10, 10) - indices = torch.full((B, S, HKV, topk), SKV, dtype=torch.int32, device='cuda') + indices = torch.full((B, S, HKV, topk), SKV, dtype=torch.int32, device="cuda") for b in range(B): for t in range(S): for h in range(HKV): i_i = torch.randperm(min(max(1, ((t + q_start_s_index) // KV_stride)), SKV))[:topk] - indices[b, t, h, :len(i_i)] = i_i + indices[b, t, h, : len(i_i)] = i_i - kernel = sparse_mla_fwd_interface( - q, kv, indices, q_start_s_index, KV_stride, return_kernel=True, print_kernel=True) + kernel = sparse_mla_fwd_interface(q, kv, indices, q_start_s_index, KV_stride, return_kernel=True, print_kernel=True) def fn(): out, lse = kernel(q, kv, indices, q_start_s_index_t) if q_start_s_index == 0 and KV_stride > 1: - out[:, :KV_stride - 1, :, :] = 0 + out[:, : KV_stride - 1, :, :] = 0 return out, lse tl_out, tl_lse = fn() @@ -442,14 +416,15 @@ def test_sparse_mla_fwd_pipelined(B=1, torch.testing.assert_close(tl_out, ref_out, rtol=1e-3, atol=1e-3) from tilelang.profiler import do_bench + ms = do_bench( fn, rep=10, warmup=10, ) print(f"Average time: {ms:.3f} ms") - print(f'fwd io bandwidth = ', (B * S * DQK * topk * 2) / (ms * 1e-3) / 1e12) - print(f'fwd tflops = ', (B * S * (DQK + DV) * topk * 2 * H) / (ms * 1e-3) / 1e12) + print(f"fwd io bandwidth = ", (B * S * DQK * topk * 2) / (ms * 1e-3) / 1e12) + print(f"fwd tflops = ", (B * S * (DQK + DV) * topk * 2 * H) / (ms * 1e-3) / 1e12) if __name__ == "__main__": @@ -460,5 +435,4 @@ if __name__ == "__main__": B, S, SKV, H, HKV, DQK, DV, topk, dtype = 1, 1024, 8192, 128, 1, 576, 512, 2048, torch.bfloat16 else: B, S, SKV, H, HKV, DQK, DV, topk, dtype = 1, 4096, 8192, 128, 1, 576, 512, 2048, torch.bfloat16 - test_sparse_mla_fwd_pipelined( - B, S, SKV, H, HKV, DQK, DV, topk, dtype, check_correctness=args.test_correctness) + test_sparse_mla_fwd_pipelined(B, S, SKV, H, HKV, DQK, DV, topk, dtype, check_correctness=args.test_correctness) diff --git 
a/examples/deepseek_v32/test_tilelang_example_deepseek_v32.py b/examples/deepseek_v32/test_tilelang_example_deepseek_v32.py index e10141b59039440ecaf21fb22db977fc875f59e0..6b7e879ba6e4fc1155660ad6be10da917c7a5ad5 100644 --- a/examples/deepseek_v32/test_tilelang_example_deepseek_v32.py +++ b/examples/deepseek_v32/test_tilelang_example_deepseek_v32.py @@ -1,4 +1,5 @@ # ruff: noqa +import tilelang import tilelang.testing import topk_selector @@ -20,23 +21,20 @@ def test_example_fp8_lighting_indexer(): @tilelang.testing.requires_cuda_compute_version_ge(9, 0) def test_example_sparse_mla_fwd(): # small shapes for testing - sparse_mla_fwd.test_sparse_mla_fwd( - S=256, SKV=1024, H=64, HKV=1, DQK=576, DV=512, topk=256, check_correctness=False) + sparse_mla_fwd.test_sparse_mla_fwd(S=256, SKV=1024, H=64, HKV=1, DQK=576, DV=512, topk=256, check_correctness=False) @tilelang.testing.requires_cuda @tilelang.testing.requires_cuda_compute_version_ge(9, 0) def test_example_sparse_mla_fwd_pipelined(): # small shapes for testing - sparse_mla_fwd_pipelined.test_sparse_mla_fwd_pipelined( - S=256, SKV=512, H=64, HKV=1, DQK=576, DV=512, topk=256, check_correctness=False) + sparse_mla_fwd_pipelined.test_sparse_mla_fwd_pipelined(S=256, SKV=512, H=64, HKV=1, DQK=576, DV=512, topk=256, check_correctness=False) @tilelang.testing.requires_cuda @tilelang.testing.requires_cuda_compute_version_ge(9, 0) def test_example_sparse_mla_bwd(): - sparse_mla_bwd.test_sparse_mla_bwd( - S=256, SKV=512, H=64, HKV=1, DQKV=576, DV=512, topk=256, check_correctness=False) + sparse_mla_bwd.test_sparse_mla_bwd(S=256, SKV=512, H=64, HKV=1, DQKV=576, DV=512, topk=256, check_correctness=False) if __name__ == "__main__": diff --git a/examples/deepseek_v32/topk_selector.py b/examples/deepseek_v32/topk_selector.py index 4a4b432775885a1632c430b79d1a7395ab28973f..244f74c69615430ca32ea847529639a1ccf65cca 100644 --- a/examples/deepseek_v32/topk_selector.py +++ b/examples/deepseek_v32/topk_selector.py @@ -8,24 +8,24 @@ pass_configs = { def convert_to_uint16(x): - hval = T.Cast("float16", x) - bits_uint = T.reinterpret("uint16", hval) + hval = T.Cast(T.float16, x) + bits_uint = T.reinterpret(T.uint16, hval) bits_uint = T.if_then_else(x < 0, ~bits_uint & (0xFFFF), bits_uint | (0x8000)) return bits_uint >> 8 def convert_to_uint32(x): - bits_uint = T.reinterpret("uint32", x) + bits_uint = T.reinterpret(T.uint32, x) bits_uint = T.if_then_else( x < 0, - ~bits_uint & T.Cast("uint32", (0xFFFFFFFF)), - bits_uint | T.Cast("uint32", (0x80000000)), + ~bits_uint & T.Cast(T.uint32, (0xFFFFFFFF)), + bits_uint | T.Cast(T.uint32, (0x80000000)), ) return bits_uint @tilelang.jit(pass_configs=pass_configs) -def tl_topk_impl(topk, in_dtype="float32", out_dtype="int32"): +def tl_topk_impl(topk, in_dtype=T.float32, out_dtype=T.int32): batch = T.dynamic("batch") seq_len = T.dynamic("seq_len") RADIX = 1 << 8 @@ -42,20 +42,20 @@ def tl_topk_impl(topk, in_dtype="float32", out_dtype="int32"): with T.Kernel(batch, threads=BLOCK_SIZE) as (bx): tx = T.get_thread_binding() - s_threshold_bin_id = T.alloc_shared([1], "int32") - s_histogram = T.alloc_shared([RADIX + 1], "int32") - s_num_input = T.alloc_shared([2], "int32") - s_input_idx = T.alloc_shared([2, SMEM_INPUT_SIZE], "int32") - - l_threshold_bin_id = T.alloc_var("int32") - l_new_topk = T.alloc_var("int32") - l_num_input = T.alloc_var("int32") - l_bin_id32 = T.alloc_var("int32") - l_val = T.alloc_var("int32") - l_start_pos = T.alloc_var("int32") - l_start_idx = T.alloc_var("int32") - l_end_idx = T.alloc_var("int32") - l_out_pos 
= T.alloc_var("int32") + s_threshold_bin_id = T.alloc_shared([1], T.int32) + s_histogram = T.alloc_shared([RADIX + 1], T.int32) + s_num_input = T.alloc_shared([2], T.int32) + s_input_idx = T.alloc_shared([2, SMEM_INPUT_SIZE], T.int32) + + l_threshold_bin_id = T.alloc_var(T.int32) + l_new_topk = T.alloc_var(T.int32) + l_num_input = T.alloc_var(T.int32) + l_bin_id32 = T.alloc_var(T.int32) + l_val = T.alloc_var(T.int32) + l_start_pos = T.alloc_var(T.int32) + l_start_idx = T.alloc_var(T.int32) + l_end_idx = T.alloc_var(T.int32) + l_out_pos = T.alloc_var(T.int32) l_new_topk = topk l_start_idx = starts[bx] @@ -99,7 +99,7 @@ def tl_topk_impl(topk, in_dtype="float32", out_dtype="int32"): input_idx = s * BLOCK_SIZE + tx if input_idx < l_end_idx and input_idx >= l_start_idx and input_idx < seq_len: bin_id = convert_to_uint16(input[bx, input_idx]) - l_bin_id32 = T.Cast("int32", bin_id) + l_bin_id32 = T.Cast(T.int32, bin_id) if l_bin_id32 > l_threshold_bin_id: # need a pos = T.atomic_add(s_histogram[bin_id32+1], 1) pos = T.atomic_add(s_histogram[l_bin_id32 + 1], 1, return_prev=True) @@ -127,9 +127,9 @@ def tl_topk_impl(topk, in_dtype="float32", out_dtype="int32"): l_num_input = s_num_input[r_idx] for s in T.serial(T.ceildiv(l_num_input, BLOCK_SIZE)): if s * BLOCK_SIZE + tx < l_num_input: - l_bin_id32 = T.Cast("int32", (( - convert_to_uint32(input[bx, s_input_idx[r_idx, s * BLOCK_SIZE + tx]]) >> - (24 - round * 8)) & 0xFF)) + l_bin_id32 = T.Cast( + T.int32, ((convert_to_uint32(input[bx, s_input_idx[r_idx, s * BLOCK_SIZE + tx]]) >> (24 - round * 8)) & 0xFF) + ) T.atomic_add(s_histogram[l_bin_id32], 1) T.sync_threads() # cumsum @@ -156,23 +156,20 @@ def tl_topk_impl(topk, in_dtype="float32", out_dtype="int32"): for s in T.serial(T.ceildiv(l_num_input, BLOCK_SIZE)): T.sync_threads() if s * BLOCK_SIZE + tx < l_num_input: - l_bin_id32 = T.Cast("int32", (( - convert_to_uint32(input[bx, s_input_idx[r_idx, s * BLOCK_SIZE + tx]]) >> - (24 - round * 8)) & 0xFF)) + l_bin_id32 = T.Cast( + T.int32, ((convert_to_uint32(input[bx, s_input_idx[r_idx, s * BLOCK_SIZE + tx]]) >> (24 - round * 8)) & 0xFF) + ) if l_bin_id32 > l_threshold_bin_id: - pos = T.atomic_add( - s_histogram[l_bin_id32 + 1], 1, return_prev=True) + l_start_pos + pos = T.atomic_add(s_histogram[l_bin_id32 + 1], 1, return_prev=True) + l_start_pos index[bx, pos] = s_input_idx[r_idx, s * BLOCK_SIZE + tx] elif l_bin_id32 == l_threshold_bin_id and l_new_topk > 0: if round == 3: - l_out_pos = T.atomic_add( - s_histogram[l_bin_id32 + 1], 1, return_prev=True) + l_start_pos + l_out_pos = T.atomic_add(s_histogram[l_bin_id32 + 1], 1, return_prev=True) + l_start_pos if l_out_pos < topk: index[bx, l_out_pos] = s_input_idx[r_idx, s * BLOCK_SIZE + tx] else: pos = T.atomic_add(s_num_input[r_idx ^ 1], 1, return_prev=True) - s_input_idx[r_idx ^ 1, pos] = s_input_idx[r_idx, - s * BLOCK_SIZE + tx] + s_input_idx[r_idx ^ 1, pos] = s_input_idx[r_idx, s * BLOCK_SIZE + tx] return tl_topk_kernel @@ -186,7 +183,6 @@ def tl_topk(input, starts, ends, topk): def test_topk_selector(batch=64, seq_len=32 * 1024, topk=2048): - batch = 64 seq_len = 32 * 1024 topk = 2048 @@ -212,8 +208,7 @@ def test_topk_selector(batch=64, seq_len=32 * 1024, topk=2048): set_ref = set(ref_np) set_trt = set(trt_np) intersection = set_ref & set_trt - print("selected/all:", len(intersection), "/", len(set_ref), "=", - len(intersection) / len(set_ref)) + print("selected/all:", len(intersection), "/", len(set_ref), "=", len(intersection) / len(set_ref)) # Performance test with CUDA events diff --git 
a/examples/deepseek_v32/utils.py b/examples/deepseek_v32/utils.py index 2ea34b14a465c2079d2726d2b2d108397a10c685..d7252e171108aa13396f6d3e91d84d04de1d3c17 100644 --- a/examples/deepseek_v32/utils.py +++ b/examples/deepseek_v32/utils.py @@ -23,8 +23,7 @@ def _is_equal(a, b): if isinstance(a, torch.Tensor): return a is b # Whitelist of types that are safe to compare by value for caching. - if isinstance(a, (int, float, str, bool, type(None))) and isinstance( - b, (int, float, str, bool, type(None))): + if isinstance(a, (int, float, str, bool, type(None))) and isinstance(b, (int, float, str, bool, type(None))): return a == b # For other types, we cannot guarantee a cheap and safe comparison, so we fail the cache check. return False @@ -58,9 +57,11 @@ def tensor_cache(fn: Callable[..., torch.Tensor]) -> Callable[..., torch.Tensor] if len(args) == len(last_args) and len(kwargs) == len(last_kwargs): # For Tensors, check for object identity. For other types, check for equality. # Python caches small integers, so `is` works for them but not for large integers like 4096. - if all(_is_equal(a, b) for a, b in zip(args, last_args)) and \ - set(kwargs.keys()) == set(last_kwargs.keys()) and \ - all(_is_equal(v, last_kwargs[k]) for k, v in kwargs.items()): + if ( + all(_is_equal(a, b) for a, b in zip(args, last_args)) + and set(kwargs.keys()) == set(last_kwargs.keys()) + and all(_is_equal(v, last_kwargs[k]) for k, v in kwargs.items()) + ): return last_result result = fn(*args, **kwargs) @@ -79,73 +80,68 @@ def cal_seq_idx_from_cu_seqlens(cu_seqlens: torch.LongTensor, seq_len: int): @tensor_cache -def cal_seq_idx_for_q(cu_seqlens_qs: torch.LongTensor, cu_seqlens_qe: torch.LongTensor, - seq_len: int) -> torch.IntTensor: - seq_idx_for_q = torch.full((seq_len,), - len(cu_seqlens_qs), - dtype=torch.int32, - device=cu_seqlens_qs.device) +def cal_seq_idx_for_q(cu_seqlens_qs: torch.LongTensor, cu_seqlens_qe: torch.LongTensor, seq_len: int) -> torch.IntTensor: + seq_idx_for_q = torch.full((seq_len,), len(cu_seqlens_qs), dtype=torch.int32, device=cu_seqlens_qs.device) for i in range(len(cu_seqlens_qs)): - seq_idx_for_q[cu_seqlens_qs[i]:cu_seqlens_qe[i]] = i + seq_idx_for_q[cu_seqlens_qs[i] : cu_seqlens_qe[i]] = i return seq_idx_for_q @tensor_cache -def cal_cu_seqlen_ks_for_q(cu_seqlens_qs: torch.LongTensor, cu_seqlens_qe: torch.LongTensor, - cu_seqlens_ks: torch.LongTensor, seq_len: int) -> torch.IntTensor: +def cal_cu_seqlen_ks_for_q( + cu_seqlens_qs: torch.LongTensor, cu_seqlens_qe: torch.LongTensor, cu_seqlens_ks: torch.LongTensor, seq_len: int +) -> torch.IntTensor: cu_seqlen_ks_for_each_q = torch.gather( - input=torch.cat([ - cu_seqlens_ks, - torch.full((1,), - torch.iinfo(torch.int32).max, - dtype=torch.int32, - device=cu_seqlens_qs.device) - ]), + input=torch.cat([cu_seqlens_ks, torch.full((1,), torch.iinfo(torch.int32).max, dtype=torch.int32, device=cu_seqlens_qs.device)]), dim=0, - index=cal_seq_idx_for_q( - cu_seqlens_qs=cu_seqlens_qs, cu_seqlens_qe=cu_seqlens_qe, seq_len=seq_len).long()) + index=cal_seq_idx_for_q(cu_seqlens_qs=cu_seqlens_qs, cu_seqlens_qe=cu_seqlens_qe, seq_len=seq_len).long(), + ) return cu_seqlen_ks_for_each_q.int() @tensor_cache -def cal_cu_seqlen_ke_for_q(cu_seqlens_qs: torch.LongTensor, cu_seqlens_qe: torch.LongTensor, - cu_seqlens_ks: torch.LongTensor, cu_seqlens_ke: torch.LongTensor, - q_start_idxs: torch.LongTensor, seq_len: int, - kv_stride: int) -> torch.IntTensor: +def cal_cu_seqlen_ke_for_q( + cu_seqlens_qs: torch.LongTensor, + cu_seqlens_qe: torch.LongTensor, + 
cu_seqlens_ks: torch.LongTensor, + cu_seqlens_ke: torch.LongTensor, + q_start_idxs: torch.LongTensor, + seq_len: int, + kv_stride: int, +) -> torch.IntTensor: cu_seqlen_ke_for_each_q = torch.gather( - input=torch.cat( - [cu_seqlens_ke, - torch.zeros(1, dtype=torch.int32, device=cu_seqlens_qs.device)]), + input=torch.cat([cu_seqlens_ke, torch.zeros(1, dtype=torch.int32, device=cu_seqlens_qs.device)]), dim=0, - index=cal_seq_idx_for_q( - cu_seqlens_qs=cu_seqlens_qs, cu_seqlens_qe=cu_seqlens_qe, seq_len=seq_len).long()) - casual_cu_seqlen_ke_for_each_q = torch.zeros((seq_len,), - dtype=torch.int32, - device=cu_seqlens_qs.device) + index=cal_seq_idx_for_q(cu_seqlens_qs=cu_seqlens_qs, cu_seqlens_qe=cu_seqlens_qe, seq_len=seq_len).long(), + ) + casual_cu_seqlen_ke_for_each_q = torch.zeros((seq_len,), dtype=torch.int32, device=cu_seqlens_qs.device) for i in range(len(cu_seqlens_qs)): - casual_cu_seqlen_ke_for_each_q[cu_seqlens_qs[i]:cu_seqlens_qe[i]] = (torch.arange( - q_start_idxs[i], - q_start_idxs[i] + cu_seqlens_qe[i] - cu_seqlens_qs[i], - dtype=torch.int32, - device=cu_seqlens_qs.device) + 1) // kv_stride + cu_seqlens_ks[i] + casual_cu_seqlen_ke_for_each_q[cu_seqlens_qs[i] : cu_seqlens_qe[i]] = ( + torch.arange( + q_start_idxs[i], q_start_idxs[i] + cu_seqlens_qe[i] - cu_seqlens_qs[i], dtype=torch.int32, device=cu_seqlens_qs.device + ) + + 1 + ) // kv_stride + cu_seqlens_ks[i] cu_seqlen_ke_for_each_q = torch.minimum(casual_cu_seqlen_ke_for_each_q, cu_seqlen_ke_for_each_q) return cu_seqlen_ke_for_each_q.int() @tensor_cache -def cal_ks_ke_from_cu_seqlen_qk(cu_seqlens_q: torch.LongTensor, - cu_seqlens_k: torch.LongTensor = None, - offs_q: torch.LongTensor = None, - *, - seq_len: int, - kv_stride: int = 1, - cp_rank: int = 0, - cp_size: int = 1, - balanced_cp=False): - ''' +def cal_ks_ke_from_cu_seqlen_qk( + cu_seqlens_q: torch.LongTensor, + cu_seqlens_k: torch.LongTensor = None, + offs_q: torch.LongTensor = None, + *, + seq_len: int, + kv_stride: int = 1, + cp_rank: int = 0, + cp_size: int = 1, + balanced_cp=False, +): + """ seq_len: seq len per cp rank balanced cp slice assignment: 0 1 2 3 3 2 1 0 - ''' + """ n_seq = len(cu_seqlens_q) - 1 assert n_seq > 0 assert cu_seqlens_q.shape == (n_seq + 1,) @@ -170,10 +166,12 @@ def cal_ks_ke_from_cu_seqlen_qk(cu_seqlens_q: torch.LongTensor, def f(x: torch.Tensor): chunks = x.chunk(cp_size * 2) - return torch.cat([ - chunks[cp_rank], - chunks[cp_size - cp_rank - 1], - ]) + return torch.cat( + [ + chunks[cp_rank], + chunks[cp_size - cp_rank - 1], + ] + ) ks = f(ks) ke = f(ke) @@ -189,8 +187,7 @@ def ceil_to_ue8m0(x: torch.Tensor): return torch.pow(2.0, torch.ceil(torch.log2(x.abs()))) -def per_custom_dims_cast_to_fp8(x: torch.Tensor, dims: Tuple[int], - use_ue8m0: bool) -> Tuple[torch.Tensor, torch.Tensor]: +def per_custom_dims_cast_to_fp8(x: torch.Tensor, dims: Tuple[int], use_ue8m0: bool) -> Tuple[torch.Tensor, torch.Tensor]: excluded_dims = tuple([i for i in range(x.dim()) if i not in set(dims)]) x_amax = x.abs().float().amax(dim=excluded_dims, keepdim=True).clamp(1e-4) sf = x_amax / 448.0 @@ -239,14 +236,18 @@ def generate_random_cu_seqlens(per_cp_seqlen, cp_size=4, cp_rank=3, kv_stride=1, total_seqlen - (cp_rank + 1) * per_chunk_seqlen, total_seqlen - cp_rank * per_chunk_seqlen, ) - ks = torch.cat([ - cu_seqlens_ks_for_each_q[slice_short], - cu_seqlens_ks_for_each_q[slice_long], - ]) - ke = torch.cat([ - cu_seqlens_ke_for_each_q[slice_short], - cu_seqlens_ke_for_each_q[slice_long], - ]) + ks = torch.cat( + [ + cu_seqlens_ks_for_each_q[slice_short], + 
cu_seqlens_ks_for_each_q[slice_long], + ] + ) + ke = torch.cat( + [ + cu_seqlens_ke_for_each_q[slice_short], + cu_seqlens_ke_for_each_q[slice_long], + ] + ) assert len(ks) == len(ke) == per_cp_seqlen return ks, ke @@ -302,11 +303,9 @@ def assert_tensors_similar(x, y, eps=1e-8, name="tensor", raise_assert=True): raise_assert: Whether to raise assertion error on failure """ sim = calculate_tensor_similarity(x, y, name) - diff = 1. - sim + diff = 1.0 - sim if not (0 <= diff <= eps): - print( - f"\033[31mERROR: {name} similarity check failed, diff={diff:.2e} (threshold={eps:.2e})\033[0m" - ) + print(f"\033[31mERROR: {name} similarity check failed, diff={diff:.2e} (threshold={eps:.2e})\033[0m") if raise_assert: assert False # noqa: B011 @@ -316,11 +315,8 @@ if __name__ == "__main__": cu_seqlens = torch.randint(128, 4096, (1000,), dtype=torch.int32, device="cuda") last_idx = torch.where(cu_seqlens.cumsum(dim=0) >= seq_len)[0][0] cu_seqlens_cumsum = cu_seqlens[:last_idx].cumsum(dim=0) - cu_seqlens_qs = torch.cat( - [torch.zeros(1, dtype=torch.int32, device=cu_seqlens.device), cu_seqlens_cumsum]) - cu_seqlens_qe = torch.cat( - [cu_seqlens_cumsum, - torch.ones(1, dtype=torch.int32, device=cu_seqlens.device) * seq_len]) + cu_seqlens_qs = torch.cat([torch.zeros(1, dtype=torch.int32, device=cu_seqlens.device), cu_seqlens_cumsum]) + cu_seqlens_qe = torch.cat([cu_seqlens_cumsum, torch.ones(1, dtype=torch.int32, device=cu_seqlens.device) * seq_len]) from tilelang.profiler import do_bench diff --git a/examples/dequantize_gemm/dequantize_utils.py b/examples/dequantize_gemm/dequantize_utils.py index b14c0aee687636e7bb8e85b3ffbdaaeec191bb58..90a6265ffa4bf22c3d583e74e53066161c80a37a 100644 --- a/examples/dequantize_gemm/dequantize_utils.py +++ b/examples/dequantize_gemm/dequantize_utils.py @@ -39,12 +39,10 @@ def torch_convert_bit_twiddling(tensor): res0 = val_concat_expanded & mask res1 = (val_concat_expanded << 3) & mask res2 = (val_concat_expanded << 6) & mask - res3 = ((val_concat_expanded << 1) & mask1) | ((val_concat_expanded >> 3) & mask2) | ( - (val_concat_expanded >> 7) & mask3) + res3 = ((val_concat_expanded << 1) & mask1) | ((val_concat_expanded >> 3) & mask2) | ((val_concat_expanded >> 7) & mask3) # Select the correct result based on position - bf16 = torch.where(pos == 0, res0, torch.where(pos == 1, res1, - torch.where(pos == 2, res2, res3))) + bf16 = torch.where(pos == 0, res0, torch.where(pos == 1, res1, torch.where(pos == 2, res2, res3))) # Convert to uint16 for .view(torch.bfloat16) bf16_uint16 = (bf16 & 0xFFFF).to(torch.uint16) @@ -110,7 +108,7 @@ def print_bit(name, val): val (torch.Tensor): A scalar PyTorch tensor (numeric) whose 32-bit binary representation will be shown. 
""" val_cpu = val.cpu().item() - binary_repr = f'{val_cpu:032b}' + binary_repr = f"{val_cpu:032b}" print(name, binary_repr) @@ -122,7 +120,7 @@ def calc_sim(x, y, name="tensor"): x, y = x.data.double(), y.data.double() denominator = (x * x + y * y).sum() if denominator == 0: - print_red_warning(f'{name} all zero') + print_red_warning(f"{name} all zero") return 1 sim = 2 * (x * y).sum() / denominator return sim @@ -132,21 +130,19 @@ def assert_similar(x, y, eps=1e-8, name="tensor", data="", raise_assert=True): x_mask = torch.isfinite(x) y_mask = torch.isfinite(y) if not torch.all(x_mask == y_mask): - print_red_warning(f'{name} Error: isfinite mask mismatch') + print_red_warning(f"{name} Error: isfinite mask mismatch") if raise_assert: raise AssertionError - if not torch.isclose( - x.masked_fill(x_mask, 0), y.masked_fill(y_mask, 0), rtol=0, atol=0, - equal_nan=True).all(): - print_red_warning(f'{name} Error: nonfinite value mismatch') + if not torch.isclose(x.masked_fill(x_mask, 0), y.masked_fill(y_mask, 0), rtol=0, atol=0, equal_nan=True).all(): + print_red_warning(f"{name} Error: nonfinite value mismatch") if raise_assert: raise AssertionError x = x.masked_fill(~x_mask, 0) y = y.masked_fill(~y_mask, 0) sim = calc_sim(x, y, name) - diff = (1. - sim).item() - print(f'{diff=}') + diff = (1.0 - sim).item() + print(f"{diff=}") if not (0 <= diff <= eps): - print_red_warning(f'{name} Error: {diff=}') + print_red_warning(f"{name} Error: {diff=}") if raise_assert: raise AssertionError diff --git a/examples/dequantize_gemm/example_dequant_gemm_bf16_fp4_hopper.py b/examples/dequantize_gemm/example_dequant_gemm_bf16_fp4_hopper.py index e30845b8d777194e91a4cf3af8bbe0939a7dc56d..2d9c945b36083073e020b424725da949614d4df0 100644 --- a/examples/dequantize_gemm/example_dequant_gemm_bf16_fp4_hopper.py +++ b/examples/dequantize_gemm/example_dequant_gemm_bf16_fp4_hopper.py @@ -24,6 +24,7 @@ def get_configs(): the parameter name to its chosen value. """ import itertools + iter_params = dict( block_M=[64, 128, 256], block_N=[64, 128, 256], @@ -32,65 +33,64 @@ def get_configs(): threads=[128, 256, 512], split=[1, 2], ) - return [{ - k: v for k, v in zip(iter_params, values) - } for values in itertools.product(*iter_params.values())] + return [{k: v for k, v in zip(iter_params, values)} for values in itertools.product(*iter_params.values())] -@tilelang.autotune(configs=get_configs(),) +@tilelang.autotune( + configs=get_configs(), +) @tilelang.jit( out_idx=[-1], - pass_configs={ - tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, - tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True - }, + pass_configs={tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True}, ) -def matmul(M, - N, - K, - in_dtype, - out_dtype, - accum_dtype, - source_format='uint', - num_bits=4, - fast_dequant=True, - block_M=256, - block_N=128, - block_K=128, - num_stages=2, - threads=256, - split=1): +def matmul( + M, + N, + K, + in_dtype, + out_dtype, + accum_dtype, + source_format=T.uint32, + num_bits=4, + fast_dequant=True, + block_M=256, + block_N=128, + block_K=128, + num_stages=2, + threads=256, + split=1, +): + """ + Builds a parameterized TileLang/TIR matrix-multiplication kernel that dequantizes 4-bit FP inputs to BF16 on-the-fly and computes C = A @ B^T. + + This function returns a tiled, autotunable prim_func implementing a block-wise GEMM with shared-memory buffering and a pipelined K-loop. The kernel accepts: + - A: dense input of shape (M, K) with dtype `in_dtype`. 
+ - B: packed quantized input of shape (N, QK) where QK = K / (8 / num_bits) stored as `uint8`. + - C: output of shape (M, N) with dtype `out_dtype`. + + The generated kernel supports two dequantization paths: + - fast_dequant (fast_dequant=True): calls an external mxfp dequantization intrinsic (twiddling-based) loaded from a C source returned by get_mxfp_intrin_group. + - simple dequant (fast_dequant=False): performs a pure-TIR FP4 -> BF16 conversion per element. + + Important behavior and requirements: + - num_bits (default 4) is the bit-width of the quantized elements; storage_dtype is uint8 and num_elems_per_byte = 8 // num_bits. + - QK = K // num_elems_per_byte and Block_QK = block_K // num_elems_per_byte determine B and shared-buffer shapes. + - Asserts that K % (block_K * split) == 0; K must be divisible by block_K * split for the tiling to be valid. + - When fast_dequant is True, a valid mxfp intrinsic group (C source and function name) must be available via tilelang.quantize.get_mxfp_intrin_group. + - The kernel launches a 2D grid over ceildiv(N, block_N) and ceildiv(M, block_M) and uses `threads` threads per block with `num_stages` pipeline stages. + + Parameters that alter kernel layout/behavior (brief): + - block_M, block_N, block_K: tile sizes for M, N, and K dimensions. + - num_stages: number of software pipeline stages for the K-loop. + - threads: number of threads used per kernel block. + - split: extra K-splitting factor; K must be divisible by block_K * split. + - source_format, num_bits: describe the quantized data layout passed to the mxfp intrinsics. + + Returns: + A TileLang/TIR prim_func (the compiled `main`) implementing the described dequantize-then-GEMM kernel. """ - Builds a parameterized TileLang/TIR matrix-multiplication kernel that dequantizes 4-bit FP inputs to BF16 on-the-fly and computes C = A @ B^T. - - This function returns a tiled, autotunable prim_func implementing a block-wise GEMM with shared-memory buffering and a pipelined K-loop. The kernel accepts: - - A: dense input of shape (M, K) with dtype `in_dtype`. - - B: packed quantized input of shape (N, QK) where QK = K / (8 / num_bits) stored as `uint8`. - - C: output of shape (M, N) with dtype `out_dtype`. - - The generated kernel supports two dequantization paths: - - fast_dequant (fast_dequant=True): calls an external mxfp dequantization intrinsic (twiddling-based) loaded from a C source returned by get_mxfp_intrin_group. - - simple dequant (fast_dequant=False): performs a pure-TIR FP4 -> BF16 conversion per element. - - Important behavior and requirements: - - num_bits (default 4) is the bit-width of the quantized elements; storage_dtype is uint8 and num_elems_per_byte = 8 // num_bits. - - QK = K // num_elems_per_byte and Block_QK = block_K // num_elems_per_byte determine B and shared-buffer shapes. - - Asserts that K % (block_K * split) == 0; K must be divisible by block_K * split for the tiling to be valid. - - When fast_dequant is True, a valid mxfp intrinsic group (C source and function name) must be available via tilelang.quantize.get_mxfp_intrin_group. - - The kernel launches a 2D grid over ceildiv(N, block_N) and ceildiv(M, block_M) and uses `threads` threads per block with `num_stages` pipeline stages. - - Parameters that alter kernel layout/behavior (brief): - - block_M, block_N, block_K: tile sizes for M, N, and K dimensions. - - num_stages: number of software pipeline stages for the K-loop. - - threads: number of threads used per kernel block. 
- - split: extra K-splitting factor; K must be divisible by block_K * split. - - source_format, num_bits: describe the quantized data layout passed to the mxfp intrinsics. - - Returns: - A TileLang/TIR prim_func (the compiled `main`) implementing the described dequantize-then-GEMM kernel. - """ num_elems_per_byte = 8 // num_bits - storage_dtype = "uint8" + storage_dtype = T.uint8 QK = K // num_elems_per_byte Block_QK = block_K // num_elems_per_byte @@ -121,7 +121,7 @@ def matmul(M, assert func_name is not None, "mxfp_intrin_info is not found" import_source = import_source - def get_fast_dequant_twiddling_func(in_dtype="fp4", out_dtype="bfloat16"): + def get_fast_dequant_twiddling_func(in_dtype="fp4", out_dtype=T.bfloat16): """ Create a TileLang macro that performs fast, twiddling-based dequantization from packed FP4 to BF16 using an external runtime plugin. @@ -131,13 +131,13 @@ def matmul(M, - Writes the dequantized BF16 values back to a shared dequantized buffer for use by the kernel. Notes and preconditions: - - Asserts that `in_dtype == "fp4"` and `out_dtype == "bfloat16"`. + - Asserts that `in_dtype == "fp4"` and `out_dtype == T.bfloat16`. - The generated macro depends on several surrounding-scope symbols (e.g., `import_source`, `func_name`, `block_K`, `Block_QK`, `threads`, `num_elems_per_byte`, `storage_dtype`, and `out_dtype`) and expects them to be defined consistently in the enclosing kernel. - The macro is optimized for block-wise, per-thread transactions sized to the target storage width (uses a MAX_TRANSACTION_SIZE_BITS constant) and uses local/register buffers sized accordingly. - The macro uses `T.import_source` to bring the external plugin into the module and `T.call_extern` to perform the high-throughput dequantization; callers must ensure the external function matches the expected calling convention and memory layout. """ assert in_dtype in ["fp4"] - assert out_dtype in ["bfloat16"] + assert out_dtype in [T.bfloat16] # Some variables for dequantization in each thread MAX_TRANSACTION_SIZE_BITS = 128 @@ -189,12 +189,11 @@ def matmul(M, # Finally, store the dequantized data to shared memory. for v in T.vectorized(0, local_size): index = i * threads * local_size + tx * local_size + v - B_dequantize_shared[index // block_K, - index % block_K] = B_dequantize_local_thread[v] + B_dequantize_shared[index // block_K, index % block_K] = B_dequantize_local_thread[v] return fast_dequant_bf16_fp4_twiddling - def get_simple_dequant_func(in_dtype="fp4", out_dtype="bfloat16"): + def get_simple_dequant_func(in_dtype="fp4", out_dtype=T.bfloat16): """ Create a simple TIR dequantization macro that converts packed 4-bit FP (FP4) stored in uint8 into bfloat16. @@ -205,7 +204,7 @@ def matmul(M, - Writes the dequantized bfloat16 block into B_dequantize_shared. Constraints: - - Supports only in_dtype="fp4" and out_dtype="bfloat16". + - Supports only in_dtype="fp4" and out_dtype=T.bfloat16. - The helper assumes nbit == 4 and produces bfloat16 values. - The macro uses a fixed test-scale of 0 (no per-element scaling) as written. @@ -213,49 +212,49 @@ def matmul(M, A TIR macro function performing the described in-place block dequantization from packed uint8 FP4 to bfloat16. 
""" assert in_dtype in ["fp4"] - assert out_dtype in ["bfloat16"] + assert out_dtype in [T.bfloat16] - def _tir_u8_to_f4_to_bf16(nbit: int, val: tir.PrimExpr, pos: tir.PrimExpr, - scale: tir.PrimExpr, dtype: str): + def _tir_u8_to_f4_to_bf16(nbit: int, val: tir.PrimExpr, pos: tir.PrimExpr, scale: tir.PrimExpr, dtype: str): """ - Convert a 4-bit FP4 value packed in a uint8 byte into a bfloat16 value. - - This helper extracts the 4-bit field located at the bit position `pos` within the - byte `val`, interprets it as an FP4 (sign, exponent, mantissa) value, applies an - exponent `scale` offset to align it with bfloat16 exponent bias, clamps the - resulting exponent to 8 bits, and returns the assembled bfloat16 bit pattern. - - Parameters: - nbit (int): Number of bits in the packed element; must be 4. - val (tir.PrimExpr): A uint8 value containing packed FP4 elements. - pos (tir.PrimExpr): Index (0-based) of which FP4 nibble inside `val` to extract. - scale (tir.PrimExpr): Exponent offset applied when converting FP4 exponent to bfloat16. - dtype (str): Target dtype string; must be "bfloat16". - - Returns: - tir.PrimExpr: A bfloat16-typed PrimExpr containing the converted value. - - Notes: - - The function asserts `nbit == 4`, `dtype == "bfloat16"`, and that `val.dtype` is "uint8". - - The conversion uses a fixed mapping from FP4 exponent/mantissa layout into bfloat16 - bit fields and clamps the computed exponent to fit into 8 bits. + Convert a 4-bit FP4 value packed in a uint8 byte into a bfloat16 value. + + This helper extracts the 4-bit field located at the bit position `pos` within the + byte `val`, interprets it as an FP4 (sign, exponent, mantissa) value, applies an + exponent `scale` offset to align it with bfloat16 exponent bias, clamps the + resulting exponent to 8 bits, and returns the assembled bfloat16 bit pattern. + + Parameters: + nbit (int): Number of bits in the packed element; must be 4. + val (tir.PrimExpr): A uint8 value containing packed FP4 elements. + pos (tir.PrimExpr): Index (0-based) of which FP4 nibble inside `val` to extract. + scale (tir.PrimExpr): Exponent offset applied when converting FP4 exponent to bfloat16. + dtype (str): Target dtype string; must be T.bfloat16. + + Returns: + tir.PrimExpr: A bfloat16-typed PrimExpr containing the converted value. + + Notes: + - The function asserts `nbit == 4`, `dtype == T.bfloat16`, and that `val.dtype` is T.uint8. + - The conversion uses a fixed mapping from FP4 exponent/mantissa layout into bfloat16 + bit fields and clamps the computed exponent to fit into 8 bits. 
""" assert nbit == 4 - assert dtype == "bfloat16" - assert val.dtype == "uint8" - mask = tir.const((1 << nbit) - 1, "uint16") - f4 = (val >> (pos.astype("uint16") * tir.const(nbit, "uint16"))) & mask - s = f4 >> tir.const(3, "uint16") - e_f4 = (f4 & tir.const(6, "uint16")) >> tir.const(1, "uint16") + assert dtype == T.bfloat16 + assert val.dtype == T.uint8 + mask = tir.const((1 << nbit) - 1, T.uint16) + f4 = (val >> (pos.astype(T.uint16) * tir.const(nbit, T.uint16))) & mask + s = f4 >> tir.const(3, T.uint16) + e_f4 = (f4 & tir.const(6, T.uint16)) >> tir.const(1, T.uint16) # Exponential bias between f4 and bf16 is 2^(8-1) - 2^(2-1) = 126 - e_bf16 = e_f4 + tir.const(126, "uint16") + e_bf16 = e_f4 + tir.const(126, T.uint16) # Scale is the exponential part, within the representation of uint8 # To handle the overflow, we use the max function to limit the exponential part to 8 bits - e_bf16 = T.min(e_bf16 + scale, tir.const((1 << 8) - 1, "uint16")) - m_f4 = f4 & tir.const(1, "uint16") + e_bf16 = T.min(e_bf16 + scale, tir.const((1 << 8) - 1, T.uint16)) + m_f4 = f4 & tir.const(1, T.uint16) val_bf16 = tir.reinterpret( - "bfloat16", ((((s << tir.const(8, "uint16")) | e_bf16) << tir.const(7, "uint16")) - | (m_f4 << tir.const(6, "uint16"))).astype("uint16")) + T.bfloat16, + ((((s << tir.const(8, T.uint16)) | e_bf16) << tir.const(7, T.uint16)) | (m_f4 << tir.const(6, T.uint16))).astype(T.uint16), + ) return val_bf16 @T.macro @@ -292,32 +291,32 @@ def matmul(M, @T.prim_func def main( - A: T.Tensor(A_shape, in_dtype), - B: T.Tensor(B_shape, storage_dtype), - C: T.Tensor((M, N), out_dtype), + A: T.Tensor(A_shape, in_dtype), + B: T.Tensor(B_shape, storage_dtype), + C: T.Tensor((M, N), out_dtype), ): """ - Kernel entry for the tiled, pipelined matmul used by the generated prim_func. - - This function implements a block-wise GEMM over a 2D grid (grid dims: ceildiv(N, block_N) x ceildiv(M, block_M)) with a thread block of `threads`. For each output block it: - - Allocates shared buffers for A, the packed/quantized B, and a dequantized B tile. - - Allocates a fragment accumulator (C_local) and a shared output tile (C_shared) with a swizzled layout. - - Pipelines over K in chunks of `block_K` for `num_stages` stages: - - Loads A and packed B tiles into shared memory. - - Dequantizes B into B_dequantize_shared using either the fast (twiddling/external) or the simple (pure-TIR) dequantization routine. - - Performs a GEMM accumulating into C_local with B transposed. - - Stores the accumulated block from C_local back to the global output C via C_shared. - - Parameters: - - A: input tile of shape (M, K) with dtype `in_dtype`. - - B: packed/quantized input of shape (N, QK) with storage dtype `storage_dtype` (quantized FP4 packing). - - C: output tensor of shape (M, N) with dtype `out_dtype`. - - Side effects: - - Writes the computed output block into the global tensor `C`. - - Uses and updates shared memory buffers and per-thread accumulators. - - No value is returned. + Kernel entry for the tiled, pipelined matmul used by the generated prim_func. + + This function implements a block-wise GEMM over a 2D grid (grid dims: ceildiv(N, block_N) x ceildiv(M, block_M)) with a thread block of `threads`. For each output block it: + - Allocates shared buffers for A, the packed/quantized B, and a dequantized B tile. + - Allocates a fragment accumulator (C_local) and a shared output tile (C_shared) with a swizzled layout. 
+ - Pipelines over K in chunks of `block_K` for `num_stages` stages: + - Loads A and packed B tiles into shared memory. + - Dequantizes B into B_dequantize_shared using either the fast (twiddling/external) or the simple (pure-TIR) dequantization routine. + - Performs a GEMM accumulating into C_local with B transposed. + - Stores the accumulated block from C_local back to the global output C via C_shared. + + Parameters: + - A: input tile of shape (M, K) with dtype `in_dtype`. + - B: packed/quantized input of shape (N, QK) with storage dtype `storage_dtype` (quantized FP4 packing). + - C: output tensor of shape (M, N) with dtype `out_dtype`. + + Side effects: + - Writes the computed output block into the global tensor `C`. + - Uses and updates shared memory buffers and per-thread accumulators. + + No value is returned. """ with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): A_shared = T.alloc_shared(A_shared_shape, in_dtype) @@ -327,9 +326,11 @@ def matmul(M, C_local = T.alloc_fragment((block_M, block_N), accum_dtype) C_shared = T.alloc_shared((block_M, block_N), out_dtype) - T.annotate_layout({ - C_shared: tilelang.layout.make_swizzled_layout(C_shared), - }) + T.annotate_layout( + { + C_shared: tilelang.layout.make_swizzled_layout(C_shared), + } + ) T.clear(C_local) for k in T.Pipelined(K // block_K, num_stages=num_stages): @@ -344,7 +345,7 @@ def matmul(M, T.gemm(A_shared, B_dequantize_shared, C_local, transpose_B=True) T.copy(C_local, C_shared) - T.copy(C_shared, C[by * block_M:(by + 1) * block_M, bx * block_N:(bx + 1) * block_N]) + T.copy(C_shared, C[by * block_M : (by + 1) * block_M, bx * block_N : (bx + 1) * block_N]) return main @@ -363,7 +364,7 @@ def ref_program_twiddling(A, qB): Returns: torch.Tensor: Result matrix C with shape (M, N) in bfloat16. """ - dtypeC = "bfloat16" + dtypeC = T.bfloat16 B = torch_convert_bit_twiddling(qB) C = torch.matmul(A.to(torch.float), B.T.to(torch.float)) C = C.to(torch.__getattribute__(dtypeC)) @@ -383,7 +384,7 @@ def ref_program_simple(A, qB): Returns: torch.Tensor: Resulting matrix C in bfloat16 with shape (M, N). 
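# Illustrative sketch (not part of this patch): the (bx, by) -> output-tile mapping used by
# the kernel above, written as plain Python/torch. Grid extents and slicing follow the
# T.Kernel(ceildiv(N, block_N), ceildiv(M, block_M)) launch and the final T.copy into C.
import math
import torch

M, N, block_M, block_N = 256, 256, 128, 64              # example sizes, divisible for simplicity
C = torch.empty(M, N)
for bx in range(math.ceil(N / block_N)):                 # grid dim 0 tiles the N axis
    for by in range(math.ceil(M / block_M)):             # grid dim 1 tiles the M axis
        tile = torch.full((block_M, block_N), float(by * 10 + bx))   # stand-in for C_shared
        C[by * block_M:(by + 1) * block_M, bx * block_N:(bx + 1) * block_N] = tile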
""" - dtypeC = "bfloat16" + dtypeC = T.bfloat16 B = torch_convert(qB) C = torch.matmul(A.to(torch.float), B.T.to(torch.float)) C = C.to(torch.__getattribute__(dtypeC)) @@ -409,16 +410,15 @@ def main(m=256, n=256, k=256, fast_dequant=True, tune=False): """ total_flops = 2 * m * n * k if tune: - kernel = matmul( - m, n, k, "bfloat16", "bfloat16", "float32", num_bits=4, fast_dequant=fast_dequant) + kernel = matmul(m, n, k, T.bfloat16, T.bfloat16, T.float32, num_bits=4, fast_dequant=fast_dequant) else: kernel = matmul( m, n, k, - "bfloat16", - "bfloat16", - "float32", + T.bfloat16, + T.bfloat16, + T.float32, num_bits=4, fast_dequant=fast_dequant, block_M=256, @@ -426,7 +426,8 @@ def main(m=256, n=256, k=256, fast_dequant=True, tune=False): block_K=128, num_stages=2, threads=256, - split=1) + split=1, + ) profiler = kernel.get_profiler(tilelang.TensorSupplyType.Auto) if fast_dequant: profiler.assert_allclose(ref_program_twiddling, rtol=0.01, atol=0.01) diff --git a/examples/dequantize_gemm/example_dequant_gemm_bf16_mxfp4_hopper.py b/examples/dequantize_gemm/example_dequant_gemm_bf16_mxfp4_hopper.py index ac1417aebc87c3842eec2600d04a4790677ed352..cc0375a1db8d89c732f3053a8a3280ccfb7df940 100644 --- a/examples/dequantize_gemm/example_dequant_gemm_bf16_mxfp4_hopper.py +++ b/examples/dequantize_gemm/example_dequant_gemm_bf16_mxfp4_hopper.py @@ -7,45 +7,45 @@ import torch from dequantize_utils import torch_convert_bit_twiddling, torch_convert -def _tir_u8_to_f4_to_bf16(nbit: int, val: tir.PrimExpr, pos: tir.PrimExpr, scale: tir.PrimExpr, - dtype: str): +def _tir_u8_to_f4_to_bf16(nbit: int, val: tir.PrimExpr, pos: tir.PrimExpr, scale: tir.PrimExpr, dtype: str): """ - Convert a 4-bit field packed in a uint8 into a bfloat16 value, applying an exponent scale. + Convert a 4-bit field packed in a uint8 into a bfloat16 value, applying an exponent scale. - This helper extracts a 4-bit nibble from `val` at byte-nibble position `pos`, interprets its - bits as a sign/exponent/mantissa in the 4-bit custom FP4 layout, adjusts the exponent by - `scale` (clamped to an 8-bit range), and assembles the corresponding bfloat16 representation. + This helper extracts a 4-bit nibble from `val` at byte-nibble position `pos`, interprets its + bits as a sign/exponent/mantissa in the 4-bit custom FP4 layout, adjusts the exponent by + `scale` (clamped to an 8-bit range), and assembles the corresponding bfloat16 representation. - Parameters: - nbit (int): Number of bits in the packed field (must be 4). - val (tir.PrimExpr): Packed input value of dtype `uint8` containing one or more 4-bit fields. - pos (tir.PrimExpr): Index of the nibble within `val` (used to shift/extract the 4-bit field). - scale (tir.PrimExpr): Per-element exponent adjustment added to the extracted exponent (uint-like). - dtype (str): Destination dtype string (must be "bfloat16"). + Parameters: + nbit (int): Number of bits in the packed field (must be 4). + val (tir.PrimExpr): Packed input value of dtype `uint8` containing one or more 4-bit fields. + pos (tir.PrimExpr): Index of the nibble within `val` (used to shift/extract the 4-bit field). + scale (tir.PrimExpr): Per-element exponent adjustment added to the extracted exponent (uint-like). + dtype (str): Destination dtype string (must be T.bfloat16). - Returns: - tir.PrimExpr: The resulting value reinterpreted as `bfloat16`. + Returns: + tir.PrimExpr: The resulting value reinterpreted as `bfloat16`. 
- Notes: - - Preconditions are enforced via assertions: nbit == 4, dtype == "bfloat16", and val.dtype == "uint8". - - The function clamps the adjusted exponent to the 8-bit range before assembling the bfloat16 bit pattern. - """ + Notes: + - Preconditions are enforced via assertions: nbit == 4, dtype == T.bfloat16, and val.dtype == T.uint8. + - The function clamps the adjusted exponent to the 8-bit range before assembling the bfloat16 bit pattern. + """ assert nbit == 4 - assert dtype == "bfloat16" - assert val.dtype == "uint8" - mask = tir.const((1 << nbit) - 1, "uint16") - f4 = (val >> (pos.astype("uint16") * tir.const(nbit, "uint16"))) & mask - s = f4 >> tir.const(3, "uint16") - e_f4 = (f4 & tir.const(6, "uint16")) >> tir.const(1, "uint16") + assert dtype == T.bfloat16 + assert val.dtype == T.uint8 + mask = tir.const((1 << nbit) - 1, T.uint16) + f4 = (val >> (pos.astype(T.uint16) * tir.const(nbit, T.uint16))) & mask + s = f4 >> tir.const(3, T.uint16) + e_f4 = (f4 & tir.const(6, T.uint16)) >> tir.const(1, T.uint16) # Exponential bias between f4 and bf16 is 2^(8-1) - 2^(2-1) = 126 - e_bf16 = e_f4 + tir.const(126, "uint16") + e_bf16 = e_f4 + tir.const(126, T.uint16) # Scale is the exponential part, within the representation of uint8 # To handle the overflow, we may use the min function to limit the exponential part to 8 bits # e_bf16 = T.min(e_bf16 + scale, tir.const((1 << 8) - 1, "uint16")) - m_f4 = f4 & tir.const(1, "uint16") - val_bf16 = tir.reinterpret("bfloat16", - ((((s << tir.const(8, "uint16")) | e_bf16) << tir.const(7, "uint16")) - | (m_f4 << tir.const(6, "uint16"))).astype("uint16")) + m_f4 = f4 & tir.const(1, T.uint16) + val_bf16 = tir.reinterpret( + T.bfloat16, + ((((s << tir.const(8, T.uint16)) | e_bf16) << tir.const(7, T.uint16)) | (m_f4 << tir.const(6, T.uint16))).astype(T.uint16), + ) return val_bf16 @@ -65,6 +65,7 @@ def get_configs(): List[dict]: A list of configuration dictionaries covering all combinations. """ import itertools + iter_params = dict( block_M=[64, 128, 256], block_N=[64, 128, 256], @@ -73,70 +74,74 @@ def get_configs(): threads=[128, 256, 512], split=[1, 2], ) - return [{ - k: v for k, v in zip(iter_params, values) - } for values in itertools.product(*iter_params.values())] - - -@tilelang.autotune(configs=get_configs(),) -@tilelang.jit(out_idx=[-1],) -def matmul(M, - N, - K, - in_dtype, - out_dtype, - accum_dtype, - source_format='uint', - num_bits=4, - scale_size=32, - fast_dequant=True, - with_bias=False, - block_M=256, - block_N=128, - block_K=128, - num_stages=2, - threads=256, - split=1): + return [{k: v for k, v in zip(iter_params, values)} for values in itertools.product(*iter_params.values())] + + +@tilelang.autotune( + configs=get_configs(), +) +@tilelang.jit( + out_idx=[-1], +) +def matmul( + M, + N, + K, + in_dtype, + out_dtype, + accum_dtype, + source_format=T.uint32, + num_bits=4, + scale_size=32, + fast_dequant=True, + with_bias=False, + block_M=256, + block_N=128, + block_K=128, + num_stages=2, + threads=256, + split=1, +): """ - Construct and return a tiled matrix-multiply TIR kernel that multiplies A (shape MxK) by a quantized B (shape Nx(QK)) and writes an MxN output in out_dtype. - - The generated kernel accepts: - - A: dense matrix with element type `in_dtype`. - - B: packed quantized matrix stored as uint8 with `num_bits` bits per element (QK = K / (8/num_bits)). - - Scale: per-block scale/exponent information used to dequantize B. 
- The kernel dequantizes B to a working floating format (out_dtype/accum_dtype) using one of two paths: - - fast_dequant (True): uses an external, hardware/implementation-specific intrinsic group (twiddling) for batch dequantization. - - fast_dequant (False): uses a simple elementwise dequantization helper. - - Parameters: - M, N, K (int): matrix dimensions (A is MxK, result is MxN). K must be divisible by (block_K * split). - in_dtype (str): element type of A (e.g., "fp4" in this file). - out_dtype (str): output tensor element type (e.g., "bfloat16"). - accum_dtype (str): accumulation type used for the inner GEMM. - source_format (str, optional): format string passed to intrinsic selector (default "uint"). - num_bits (int, optional): number of bits per quantized element in B (default 4). - scale_size (int, optional): number of elements grouped per scale entry (default 32). - fast_dequant (bool, optional): choose the fast intrinsic dequantization path when available (default True). - block_M, block_N, block_K (int, optional): tile sizes for M, N, and K dimensions (defaults 256, 128, 128). - num_stages (int, optional): pipelining stages for K loop (default 2). - threads (int, optional): threads per block used by the kernel (default 256). - split (int, optional): split factor along K used by the scheduler (default 1). - with_bias (bool, optional): whether to add Bias to the output (default False). - - Returns: - A T.prim_func implementing the tiled, pipelined GEMM that: - - loads tiled blocks of A and packed B to shared memory, - - dequantizes B via the chosen path into a shared dequantized tile, - - performs a tiled GEMM accumulating into local fragments, - - writes the final MxN block to the global output tensor. + Construct and return a tiled matrix-multiply TIR kernel that multiplies A (shape MxK) by a quantized B (shape Nx(QK)) and writes an MxN output in out_dtype. - Notes: - - The function queries an intrinsic group to obtain a fast dequantization implementation when fast_dequant is enabled; that intrinsic must supply a valid C source and function name. - - The kernel layout uses swizzled shared-memory layouts for A, B, and the shared C tile. - - An assertion enforces that K % (block_K * split) == 0. + The generated kernel accepts: + - A: dense matrix with element type `in_dtype`. + - B: packed quantized matrix stored as uint8 with `num_bits` bits per element (QK = K / (8/num_bits)). + - Scale: per-block scale/exponent information used to dequantize B. + The kernel dequantizes B to a working floating format (out_dtype/accum_dtype) using one of two paths: + - fast_dequant (True): uses an external, hardware/implementation-specific intrinsic group (twiddling) for batch dequantization. + - fast_dequant (False): uses a simple elementwise dequantization helper. + + Parameters: + M, N, K (int): matrix dimensions (A is MxK, result is MxN). K must be divisible by (block_K * split). + in_dtype (str): element type of A (e.g., "fp4" in this file). + out_dtype (str): output tensor element type (e.g., T.bfloat16). + accum_dtype (str): accumulation type used for the inner GEMM. + source_format (str, optional): format string passed to intrinsic selector (default "uint"). + num_bits (int, optional): number of bits per quantized element in B (default 4). + scale_size (int, optional): number of elements grouped per scale entry (default 32). + fast_dequant (bool, optional): choose the fast intrinsic dequantization path when available (default True). 
+ block_M, block_N, block_K (int, optional): tile sizes for M, N, and K dimensions (defaults 256, 128, 128). + num_stages (int, optional): pipelining stages for K loop (default 2). + threads (int, optional): threads per block used by the kernel (default 256). + split (int, optional): split factor along K used by the scheduler (default 1). + with_bias (bool, optional): whether to add Bias to the output (default False). + + Returns: + A T.prim_func implementing the tiled, pipelined GEMM that: + - loads tiled blocks of A and packed B to shared memory, + - dequantizes B via the chosen path into a shared dequantized tile, + - performs a tiled GEMM accumulating into local fragments, + - writes the final MxN block to the global output tensor. + + Notes: + - The function queries an intrinsic group to obtain a fast dequantization implementation when fast_dequant is enabled; that intrinsic must supply a valid C source and function name. + - The kernel layout uses swizzled shared-memory layouts for A, B, and the shared C tile. + - An assertion enforces that K % (block_K * split) == 0. """ num_elems_per_byte = 8 // num_bits - storage_dtype = "uint8" + storage_dtype = T.uint8 QK = K // num_elems_per_byte Block_QK = block_K // num_elems_per_byte A_shape = (M, K) @@ -150,6 +155,7 @@ def matmul(M, assert K % (block_K * split) == 0 from tilelang.quantize import get_mxfp_intrin_group + # fast_dequant_bf16_fp4_twiddling mxfp_intrin_info = get_mxfp_intrin_group( out_dtype=in_dtype, @@ -164,7 +170,7 @@ def matmul(M, assert func_name is not None, "mxfp_intrin_info is not found" import_source = import_source - def get_fast_dequant_twiddling_func(in_dtype="fp4", out_dtype="bfloat16"): + def get_fast_dequant_twiddling_func(in_dtype="fp4", out_dtype=T.bfloat16): """ Return a TileLang macro that performs fast dequantization of twiddled FP4-packed data into BF16. @@ -175,12 +181,12 @@ def matmul(M, - Writes the scaled BF16 results into B_dequantize_shared. Notes: - - This factory only supports in_dtype="fp4" and out_dtype="bfloat16". + - This factory only supports in_dtype="fp4" and out_dtype=T.bfloat16. - The macro depends on several names from the enclosing scope (e.g., import_source, func_name, DataType, num_elems_per_byte, storage_dtype, block_N, block_K, threads, scale_size); those must be defined and consistent with the kernel that will use the macro. - The macro issues a T.import_source and T.call_extern to invoke the external intrinsic; ensure the external implementation matching `func_name` is available at compilation/runtime. """ assert in_dtype in ["fp4"] - assert out_dtype in ["bfloat16"] + assert out_dtype in [T.bfloat16] # Some variables for dequantization in each thread MAX_TRANSACTION_SIZE_BITS = 128 @@ -252,24 +258,23 @@ def matmul(M, for v in T.vectorized(0, local_size): index = i * threads * local_size + tx * local_size + v - B_dequantize_shared[index // block_K, - index % block_K] = B_dequantize_local_thread[v] + B_dequantize_shared[index // block_K, index % block_K] = B_dequantize_local_thread[v] return fast_dequant_bf16_fp4_twiddling - def get_simple_dequant_func(in_dtype="fp4", out_dtype="bfloat16"): + def get_simple_dequant_func(in_dtype="fp4", out_dtype=T.bfloat16): """ Create a simple (scalar) dequantization macro that converts 4-bit packed inputs to bfloat16. 
Returns a T.macro that, given shared-storage buffers B_shared, B_dequantize_shared, a Scale tensor, and block index k, unpacks 4-bit values from B_shared, converts each nibble to a bfloat16 value using _tir_u8_to_f4_to_bf16, applies the per-element exponential Scale, and writes the dequantized BF16 block into B_dequantize_shared. Notes: - - Only supports in_dtype="fp4" and out_dtype="bfloat16". + - Only supports in_dtype="fp4" and out_dtype=T.bfloat16. - The macro expects B_shared and B_dequantize_shared to have the shapes established in the enclosing scope (B_shared_shape, B_dequantize_shared_shape) and performs block-local copying into allocated fragments before elementwise conversion. - Scale holds the exponent-like scaling values indexed per output element as used by the conversion helper. """ assert in_dtype in ["fp4"] - assert out_dtype in ["bfloat16"] + assert out_dtype in [T.bfloat16] @T.macro def simple_dequant_bf16_fp4(B_shared, B_dequantize_shared, Scale, k): @@ -301,33 +306,32 @@ def matmul(M, B_local[i, j // num_elems_per_byte], j % num_elems_per_byte, Scale[ - bx * block_N + i, k * block_K // scale_size + j // - scale_size], # Scale is the exponential part, within the representation of uint8 + bx * block_N + i, k * block_K // scale_size + j // scale_size + ], # Scale is the exponential part, within the representation of uint8 dtype=out_dtype, - ) * T.shift_left( - 1, (Scale[bx * block_N + i, k * block_K // scale_size + j // scale_size])) + ) * T.shift_left(1, (Scale[bx * block_N + i, k * block_K // scale_size + j // scale_size])) T.copy(B_dequantize_local, B_dequantize_shared) return simple_dequant_bf16_fp4 @T.prim_func def main( - A: T.Tensor(A_shape, in_dtype), - B: T.Tensor(B_shape, storage_dtype), - Scale: T.Tensor(Scale_shape, storage_dtype), - Bias: T.Tensor(Bias_shape, out_dtype), - C: T.Tensor((M, N), out_dtype), + A: T.Tensor(A_shape, in_dtype), + B: T.Tensor(B_shape, storage_dtype), + Scale: T.Tensor(Scale_shape, storage_dtype), + Bias: T.Tensor(Bias_shape, out_dtype), + C: T.Tensor((M, N), out_dtype), ): """ - Tiled, pipelined kernel entry that multiplies A with a quantized B (with per-block Scale) producing C. + Tiled, pipelined kernel entry that multiplies A with a quantized B (with per-block Scale) producing C. - This prim-level kernel implements a blocked, multi-threaded matmul: it loads tiles of A and the packed/quantized B into shared memory, dequantizes B (either via the fast intrinsic twiddling path or the simple per-element path), performs a block GEMM (with B transposed), and writes the accumulated block results into the output tensor C. The kernel allocates shared buffers for A, B, and the dequantized B, and a local fragment for accumulation; it runs over K in pipelined stages and expects the provided shapes and dtypes to match the tiling parameters used to build the function. + This prim-level kernel implements a blocked, multi-threaded matmul: it loads tiles of A and the packed/quantized B into shared memory, dequantizes B (either via the fast intrinsic twiddling path or the simple per-element path), performs a block GEMM (with B transposed), and writes the accumulated block results into the output tensor C. The kernel allocates shared buffers for A, B, and the dequantized B, and a local fragment for accumulation; it runs over K in pipelined stages and expects the provided shapes and dtypes to match the tiling parameters used to build the function. 
- Parameters are self-descriptive in the signature; notable behaviors: - - B is stored in a compact uint8-packed layout (num_bits per element) and is dequantized using Scale before GEMM. - - The selected dequantization path is controlled by the outer-scope flag `fast_dequant`. - - The GEMM uses transpose_B=True (i.e., multiplies A · B^T after dequantization). - - The function writes results in-place into C. + Parameters are self-descriptive in the signature; notable behaviors: + - B is stored in a compact uint8-packed layout (num_bits per element) and is dequantized using Scale before GEMM. + - The selected dequantization path is controlled by the outer-scope flag `fast_dequant`. + - The GEMM uses transpose_B=True (i.e., multiplies A · B^T after dequantization). + - The function writes results in-place into C. """ with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): A_shared = T.alloc_shared(A_shared_shape, in_dtype) @@ -337,23 +341,26 @@ def matmul(M, C_local = T.alloc_fragment((block_M, block_N), accum_dtype) C_shared = T.alloc_shared((block_M, block_N), out_dtype) - T.annotate_layout({ - A_shared: tilelang.layout.make_swizzled_layout(A_shared), - B_shared: tilelang.layout.make_swizzled_layout(B_shared), - C_shared: tilelang.layout.make_swizzled_layout(C_shared), - }) + T.annotate_layout( + { + A_shared: tilelang.layout.make_swizzled_layout(A_shared), + B_shared: tilelang.layout.make_swizzled_layout(B_shared), + C_shared: tilelang.layout.make_swizzled_layout(C_shared), + } + ) if with_bias: - T.annotate_layout({ - Bias_shared: tilelang.layout.make_swizzled_layout(Bias_shared), - }) + T.annotate_layout( + { + Bias_shared: tilelang.layout.make_swizzled_layout(Bias_shared), + } + ) if threads == 512: T.disable_warp_group_reg_alloc() if with_bias: - T.copy(Bias[by * block_M:(by + 1) * block_M, bx * block_N:(bx + 1) * block_N], - Bias_shared) + T.copy(Bias[by * block_M : (by + 1) * block_M, bx * block_N : (bx + 1) * block_N], Bias_shared) T.copy(Bias_shared, C_local) else: T.clear(C_local) @@ -368,7 +375,7 @@ def matmul(M, T.gemm(A_shared, B_dequantize_shared, C_local, transpose_B=True) T.copy(C_local, C_shared) - T.copy(C_shared, C[by * block_M:(by + 1) * block_M, bx * block_N:(bx + 1) * block_N]) + T.copy(C_shared, C[by * block_M : (by + 1) * block_M, bx * block_N : (bx + 1) * block_N]) return main @@ -387,9 +394,9 @@ def ref_program_twiddling(A, qB, Scale, Bias=None): Returns: torch.Tensor: Resulting matrix C with shape (M, N) in bfloat16. """ - dtypeC = "bfloat16" + dtypeC = T.bfloat16 B = torch_convert_bit_twiddling(qB) - B *= 2**(Scale[:, (torch.arange(B.shape[1], device=B.device) // 32)]) + B *= 2 ** (Scale[:, (torch.arange(B.shape[1], device=B.device) // 32)]) C = torch.matmul(A.to(torch.float), B.T.to(torch.float)) C = C.to(torch.__getattribute__(dtypeC)) return C @@ -410,9 +417,9 @@ def ref_program_twiddling_with_bias(A, qB, Scale, Bias): Returns: torch.Tensor: Resulting matrix C with shape (M, N) in bfloat16. """ - dtypeC = "bfloat16" + dtypeC = T.bfloat16 B = torch_convert_bit_twiddling(qB) - B *= 2**(Scale[:, (torch.arange(B.shape[1], device=B.device) // 32)]) + B *= 2 ** (Scale[:, (torch.arange(B.shape[1], device=B.device) // 32)]) C = torch.matmul(A.to(torch.float), B.T.to(torch.float)) + Bias C = C.to(torch.__getattribute__(dtypeC)) return C @@ -434,9 +441,9 @@ def ref_program_simple(A, qB, Scale, Bias=None): No in-place modification is performed on inputs (a local floating copy of B is scaled). 
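# Illustrative sketch (not part of this patch): the per-group exponent scaling that the
# reference programs apply after dequantizing B. Every `scale_size` consecutive columns of
# B share one uint8 exponent, so the update is a column-group gather plus a power of two.
import torch

scale_size = 32                                            # matches the default used above
B = torch.randn(4, 64)                                     # stand-in for torch_convert(qB)
Scale = torch.randint(0, 4, (4, 64 // scale_size))         # one exponent per 32-column group
col_group = torch.arange(B.shape[1]) // scale_size         # column index -> group index
B_scaled = B * 2.0 ** Scale[:, col_group]                  # same effect as the gather/loop above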
""" - dtypeC = "bfloat16" + dtypeC = T.bfloat16 B = torch_convert(qB) - B *= 2**(Scale[:, (torch.arange(B.shape[1], device=B.device) // 32)]) + B *= 2 ** (Scale[:, (torch.arange(B.shape[1], device=B.device) // 32)]) C = torch.matmul(A.to(torch.float), B.T.to(torch.float)) C = C.to(torch.__getattribute__(dtypeC)) return C @@ -462,9 +469,9 @@ def ref_program_simple_with_bias(A, qB, Scale, Bias): No in-place modification is performed on inputs (a local floating copy of B is scaled). """ - dtypeC = "bfloat16" + dtypeC = T.bfloat16 B = torch_convert(qB) - B *= 2**(Scale[:, (torch.arange(B.shape[1], device=B.device) // 32)]) + B *= 2 ** (Scale[:, (torch.arange(B.shape[1], device=B.device) // 32)]) C = torch.matmul(A.to(torch.float), B.T.to(torch.float)) + Bias C = C.to(torch.__getattribute__(dtypeC)) return C @@ -491,24 +498,16 @@ def main(m=256, n=256, k=256, scale_size=32, fast_dequant=True, with_bias=False, if tune: kernel = matmul( - m, - n, - k, - "bfloat16", - "bfloat16", - "float32", - num_bits=4, - scale_size=scale_size, - fast_dequant=fast_dequant, - with_bias=with_bias) + m, n, k, T.bfloat16, T.bfloat16, T.float32, num_bits=4, scale_size=scale_size, fast_dequant=fast_dequant, with_bias=with_bias + ) else: kernel = matmul( m, n, k, - "bfloat16", - "bfloat16", - "float32", + T.bfloat16, + T.bfloat16, + T.float32, num_bits=4, scale_size=scale_size, block_M=256, @@ -518,7 +517,8 @@ def main(m=256, n=256, k=256, scale_size=32, fast_dequant=True, with_bias=False, threads=256, split=1, fast_dequant=fast_dequant, - with_bias=with_bias) + with_bias=with_bias, + ) profiler = kernel.get_profiler(tilelang.TensorSupplyType.Auto) diff --git a/examples/dequantize_gemm/example_dequant_gemm_bf16_mxfp4_hopper_tma.py b/examples/dequantize_gemm/example_dequant_gemm_bf16_mxfp4_hopper_tma.py index 7dad795971f7a9707bd4a39c339b98d1ae0a15ac..9e90418bc75e73a4345525e7ecc339b254a0ab0c 100644 --- a/examples/dequantize_gemm/example_dequant_gemm_bf16_mxfp4_hopper_tma.py +++ b/examples/dequantize_gemm/example_dequant_gemm_bf16_mxfp4_hopper_tma.py @@ -7,45 +7,45 @@ import torch from dequantize_utils import torch_convert_bit_twiddling, torch_convert -def _tir_u8_to_f4_to_bf16(nbit: int, val: tir.PrimExpr, pos: tir.PrimExpr, scale: tir.PrimExpr, - dtype: str): +def _tir_u8_to_f4_to_bf16(nbit: int, val: tir.PrimExpr, pos: tir.PrimExpr, scale: tir.PrimExpr, dtype: str): """ - Convert a 4-bit field packed in a uint8 into a bfloat16 value, applying an exponent scale. + Convert a 4-bit field packed in a uint8 into a bfloat16 value, applying an exponent scale. - This helper extracts a 4-bit nibble from `val` at byte-nibble position `pos`, interprets its - bits as a sign/exponent/mantissa in the 4-bit custom FP4 layout, adjusts the exponent by - `scale` (clamped to an 8-bit range), and assembles the corresponding bfloat16 representation. + This helper extracts a 4-bit nibble from `val` at byte-nibble position `pos`, interprets its + bits as a sign/exponent/mantissa in the 4-bit custom FP4 layout, adjusts the exponent by + `scale` (clamped to an 8-bit range), and assembles the corresponding bfloat16 representation. - Parameters: - nbit (int): Number of bits in the packed field (must be 4). - val (tir.PrimExpr): Packed input value of dtype `uint8` containing one or more 4-bit fields. - pos (tir.PrimExpr): Index of the nibble within `val` (used to shift/extract the 4-bit field). - scale (tir.PrimExpr): Per-element exponent adjustment added to the extracted exponent (uint-like). 
- dtype (str): Destination dtype string (must be "bfloat16"). + Parameters: + nbit (int): Number of bits in the packed field (must be 4). + val (tir.PrimExpr): Packed input value of dtype `uint8` containing one or more 4-bit fields. + pos (tir.PrimExpr): Index of the nibble within `val` (used to shift/extract the 4-bit field). + scale (tir.PrimExpr): Per-element exponent adjustment added to the extracted exponent (uint-like). + dtype (str): Destination dtype string (must be T.bfloat16). - Returns: - tir.PrimExpr: The resulting value reinterpreted as `bfloat16`. + Returns: + tir.PrimExpr: The resulting value reinterpreted as `bfloat16`. - Notes: - - Preconditions are enforced via assertions: nbit == 4, dtype == "bfloat16", and val.dtype == "uint8". - - The function clamps the adjusted exponent to the 8-bit range before assembling the bfloat16 bit pattern. - """ + Notes: + - Preconditions are enforced via assertions: nbit == 4, dtype == T.bfloat16, and val.dtype == T.uint8. + - The function clamps the adjusted exponent to the 8-bit range before assembling the bfloat16 bit pattern. + """ assert nbit == 4 - assert dtype == "bfloat16" - assert val.dtype == "uint8" - mask = tir.const((1 << nbit) - 1, "uint16") - f4 = (val >> (pos.astype("uint16") * tir.const(nbit, "uint16"))) & mask - s = f4 >> tir.const(3, "uint16") - e_f4 = (f4 & tir.const(6, "uint16")) >> tir.const(1, "uint16") + assert dtype == T.bfloat16 + assert val.dtype == T.uint8 + mask = tir.const((1 << nbit) - 1, T.uint16) + f4 = (val >> (pos.astype(T.uint16) * tir.const(nbit, T.uint16))) & mask + s = f4 >> tir.const(3, T.uint16) + e_f4 = (f4 & tir.const(6, T.uint16)) >> tir.const(1, T.uint16) # Exponential bias between f4 and bf16 is 2^(8-1) - 2^(2-1) = 126 - e_bf16 = e_f4 + tir.const(126, "uint16") + e_bf16 = e_f4 + tir.const(126, T.uint16) # Scale is the exponential part, within the representation of uint8 # To handle the overflow, we may use the min function to limit the exponential part to 8 bits # e_bf16 = T.min(e_bf16 + scale, tir.const((1 << 8) - 1, "uint16")) - m_f4 = f4 & tir.const(1, "uint16") - val_bf16 = tir.reinterpret("bfloat16", - ((((s << tir.const(8, "uint16")) | e_bf16) << tir.const(7, "uint16")) - | (m_f4 << tir.const(6, "uint16"))).astype("uint16")) + m_f4 = f4 & tir.const(1, T.uint16) + val_bf16 = tir.reinterpret( + T.bfloat16, + ((((s << tir.const(8, T.uint16)) | e_bf16) << tir.const(7, T.uint16)) | (m_f4 << tir.const(6, T.uint16))).astype(T.uint16), + ) return val_bf16 @@ -65,6 +65,7 @@ def get_configs(): List[dict]: A list of configuration dictionaries covering all combinations. 
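# Illustrative sketch (not part of this patch): the expansion get_configs() performs -- the
# Cartesian product of the tuning axes, yielding one configuration dict per combination.
import itertools

iter_params = dict(block_M=[64, 128], num_stages=[1, 2])   # reduced example axes
configs = [dict(zip(iter_params, values)) for values in itertools.product(*iter_params.values())]
# -> [{'block_M': 64, 'num_stages': 1}, {'block_M': 64, 'num_stages': 2},
#     {'block_M': 128, 'num_stages': 1}, {'block_M': 128, 'num_stages': 2}]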
""" import itertools + iter_params = dict( block_M=[64, 128, 256], block_N=[64, 128, 256], @@ -73,70 +74,74 @@ def get_configs(): threads=[128, 256, 512], split=[1, 2], ) - return [{ - k: v for k, v in zip(iter_params, values) - } for values in itertools.product(*iter_params.values())] - - -@tilelang.autotune(configs=get_configs(),) -@tilelang.jit(out_idx=[-1],) -def matmul(M, - N, - K, - in_dtype, - out_dtype, - accum_dtype, - source_format='uint', - num_bits=4, - scale_size=32, - fast_dequant=True, - with_bias=False, - block_M=256, - block_N=128, - block_K=128, - num_stages=2, - threads=256, - split=1): + return [{k: v for k, v in zip(iter_params, values)} for values in itertools.product(*iter_params.values())] + + +@tilelang.autotune( + configs=get_configs(), +) +@tilelang.jit( + out_idx=[-1], +) +def matmul( + M, + N, + K, + in_dtype, + out_dtype, + accum_dtype, + source_format=T.uint32, + num_bits=4, + scale_size=32, + fast_dequant=True, + with_bias=False, + block_M=256, + block_N=128, + block_K=128, + num_stages=2, + threads=256, + split=1, +): """ - Construct and return a tiled matrix-multiply TIR kernel that multiplies A (shape MxK) by a quantized B (shape Nx(QK)) and writes an MxN output in out_dtype. - - The generated kernel accepts: - - A: dense matrix with element type `in_dtype`. - - B: packed quantized matrix stored as uint8 with `num_bits` bits per element (QK = K / (8/num_bits)). - - Scale: per-block scale/exponent information used to dequantize B. - The kernel dequantizes B to a working floating format (out_dtype/accum_dtype) using one of two paths: - - fast_dequant (True): uses an external, hardware/implementation-specific intrinsic group (twiddling) for batch dequantization. - - fast_dequant (False): uses a simple elementwise dequantization helper. - - Parameters: - M, N, K (int): matrix dimensions (A is MxK, result is MxN). K must be divisible by (block_K * split). - in_dtype (str): element type of A (e.g., "fp4" in this file). - out_dtype (str): output tensor element type (e.g., "bfloat16"). - accum_dtype (str): accumulation type used for the inner GEMM. - source_format (str, optional): format string passed to intrinsic selector (default "uint"). - num_bits (int, optional): number of bits per quantized element in B (default 4). - scale_size (int, optional): number of elements grouped per scale entry (default 32). - fast_dequant (bool, optional): choose the fast intrinsic dequantization path when available (default True). - block_M, block_N, block_K (int, optional): tile sizes for M, N, and K dimensions (defaults 256, 128, 128). - num_stages (int, optional): pipelining stages for K loop (default 2). - threads (int, optional): threads per block used by the kernel (default 256). - split (int, optional): split factor along K used by the scheduler (default 1). - with_bias (bool, optional): whether to add Bias to the output (default False). - - Returns: - A T.prim_func implementing the tiled, pipelined GEMM that: - - loads tiled blocks of A and packed B to shared memory, - - dequantizes B via the chosen path into a shared dequantized tile, - - performs a tiled GEMM accumulating into local fragments, - - writes the final MxN block to the global output tensor. + Construct and return a tiled matrix-multiply TIR kernel that multiplies A (shape MxK) by a quantized B (shape Nx(QK)) and writes an MxN output in out_dtype. 
- Notes: - - The function queries an intrinsic group to obtain a fast dequantization implementation when fast_dequant is enabled; that intrinsic must supply a valid C source and function name. - - The kernel layout uses swizzled shared-memory layouts for A, B, and the shared C tile. - - An assertion enforces that K % (block_K * split) == 0. + The generated kernel accepts: + - A: dense matrix with element type `in_dtype`. + - B: packed quantized matrix stored as uint8 with `num_bits` bits per element (QK = K / (8/num_bits)). + - Scale: per-block scale/exponent information used to dequantize B. + The kernel dequantizes B to a working floating format (out_dtype/accum_dtype) using one of two paths: + - fast_dequant (True): uses an external, hardware/implementation-specific intrinsic group (twiddling) for batch dequantization. + - fast_dequant (False): uses a simple elementwise dequantization helper. + + Parameters: + M, N, K (int): matrix dimensions (A is MxK, result is MxN). K must be divisible by (block_K * split). + in_dtype (str): element type of A (e.g., "fp4" in this file). + out_dtype (str): output tensor element type (e.g., T.bfloat16). + accum_dtype (str): accumulation type used for the inner GEMM. + source_format (str, optional): format string passed to intrinsic selector (default "uint"). + num_bits (int, optional): number of bits per quantized element in B (default 4). + scale_size (int, optional): number of elements grouped per scale entry (default 32). + fast_dequant (bool, optional): choose the fast intrinsic dequantization path when available (default True). + block_M, block_N, block_K (int, optional): tile sizes for M, N, and K dimensions (defaults 256, 128, 128). + num_stages (int, optional): pipelining stages for K loop (default 2). + threads (int, optional): threads per block used by the kernel (default 256). + split (int, optional): split factor along K used by the scheduler (default 1). + with_bias (bool, optional): whether to add Bias to the output (default False). + + Returns: + A T.prim_func implementing the tiled, pipelined GEMM that: + - loads tiled blocks of A and packed B to shared memory, + - dequantizes B via the chosen path into a shared dequantized tile, + - performs a tiled GEMM accumulating into local fragments, + - writes the final MxN block to the global output tensor. + + Notes: + - The function queries an intrinsic group to obtain a fast dequantization implementation when fast_dequant is enabled; that intrinsic must supply a valid C source and function name. + - The kernel layout uses swizzled shared-memory layouts for A, B, and the shared C tile. + - An assertion enforces that K % (block_K * split) == 0. """ num_elems_per_byte = 8 // num_bits - storage_dtype = "uint8" + storage_dtype = T.uint8 QK = K // num_elems_per_byte Block_QK = block_K // num_elems_per_byte A_shape = (M, K) @@ -150,6 +155,7 @@ def matmul(M, assert K % (block_K * split) == 0 from tilelang.quantize import get_mxfp_intrin_group + # fast_dequant_bf16_fp4_twiddling mxfp_intrin_info = get_mxfp_intrin_group( out_dtype=in_dtype, @@ -164,7 +170,7 @@ def matmul(M, assert func_name is not None, "mxfp_intrin_info is not found" import_source = import_source - def get_fast_dequant_twiddling_func(in_dtype="fp4", out_dtype="bfloat16"): + def get_fast_dequant_twiddling_func(in_dtype="fp4", out_dtype=T.bfloat16): """ Return a TileLang macro that performs fast dequantization of twiddled FP4-packed data into BF16. 
@@ -175,12 +181,12 @@ def matmul(M, - Writes the scaled BF16 results into B_dequantize_shared. Notes: - - This factory only supports in_dtype="fp4" and out_dtype="bfloat16". + - This factory only supports in_dtype="fp4" and out_dtype=T.bfloat16. - The macro depends on several names from the enclosing scope (e.g., import_source, func_name, DataType, num_elems_per_byte, storage_dtype, block_N, block_K, threads, scale_size); those must be defined and consistent with the kernel that will use the macro. - The macro issues a T.import_source and T.call_extern to invoke the external intrinsic; ensure the external implementation matching `func_name` is available at compilation/runtime. """ assert in_dtype in ["fp4"] - assert out_dtype in ["bfloat16"] + assert out_dtype in [T.bfloat16] # Some variables for dequantization in each thread MAX_TRANSACTION_SIZE_BITS = 128 @@ -252,24 +258,23 @@ def matmul(M, for v in T.vectorized(0, local_size): index = i * threads * local_size + tx * local_size + v - B_dequantize_shared[index // block_K, - index % block_K] = B_dequantize_local_thread[v] + B_dequantize_shared[index // block_K, index % block_K] = B_dequantize_local_thread[v] return fast_dequant_bf16_fp4_twiddling - def get_simple_dequant_func(in_dtype="fp4", out_dtype="bfloat16"): + def get_simple_dequant_func(in_dtype="fp4", out_dtype=T.bfloat16): """ Create a simple (scalar) dequantization macro that converts 4-bit packed inputs to bfloat16. Returns a T.macro that, given shared-storage buffers B_shared, B_dequantize_shared, a Scale tensor, and block index k, unpacks 4-bit values from B_shared, converts each nibble to a bfloat16 value using _tir_u8_to_f4_to_bf16, applies the per-element exponential Scale, and writes the dequantized BF16 block into B_dequantize_shared. Notes: - - Only supports in_dtype="fp4" and out_dtype="bfloat16". + - Only supports in_dtype="fp4" and out_dtype=T.bfloat16. - The macro expects B_shared and B_dequantize_shared to have the shapes established in the enclosing scope (B_shared_shape, B_dequantize_shared_shape) and performs block-local copying into allocated fragments before elementwise conversion. - Scale holds the exponent-like scaling values indexed per output element as used by the conversion helper. """ assert in_dtype in ["fp4"] - assert out_dtype in ["bfloat16"] + assert out_dtype in [T.bfloat16] @T.macro def simple_dequant_bf16_fp4(B_shared, B_dequantize_shared, Scale_shared, k): @@ -301,8 +306,8 @@ def matmul(M, B_local[i, j // num_elems_per_byte], j % num_elems_per_byte, Scale_shared[ - i, k * block_K // scale_size + j // - scale_size], # Scale is the exponential part, within the representation of uint8 + i, k * block_K // scale_size + j // scale_size + ], # Scale is the exponential part, within the representation of uint8 dtype=out_dtype, ) * T.shift_left(1, (Scale_shared[i, k * block_K // scale_size + j // scale_size])) T.copy(B_dequantize_local, B_dequantize_shared) @@ -311,22 +316,22 @@ def matmul(M, @T.prim_func def main( - A: T.Tensor(A_shape, in_dtype), - B: T.Tensor(B_shape, storage_dtype), - Scale: T.Tensor(Scale_shape, storage_dtype), - Bias: T.Tensor(Bias_shape, out_dtype), - C: T.Tensor((M, N), out_dtype), + A: T.Tensor(A_shape, in_dtype), + B: T.Tensor(B_shape, storage_dtype), + Scale: T.Tensor(Scale_shape, storage_dtype), + Bias: T.Tensor(Bias_shape, out_dtype), + C: T.Tensor((M, N), out_dtype), ): """ - Tiled, pipelined kernel entry that multiplies A with a quantized B (with per-block Scale) producing C. 
+ Tiled, pipelined kernel entry that multiplies A with a quantized B (with per-block Scale) producing C. - This prim-level kernel implements a blocked, multi-threaded matmul: it loads tiles of A and the packed/quantized B into shared memory, dequantizes B (either via the fast intrinsic twiddling path or the simple per-element path), performs a block GEMM (with B transposed), and writes the accumulated block results into the output tensor C. The kernel allocates shared buffers for A, B, and the dequantized B, and a local fragment for accumulation; it runs over K in pipelined stages and expects the provided shapes and dtypes to match the tiling parameters used to build the function. + This prim-level kernel implements a blocked, multi-threaded matmul: it loads tiles of A and the packed/quantized B into shared memory, dequantizes B (either via the fast intrinsic twiddling path or the simple per-element path), performs a block GEMM (with B transposed), and writes the accumulated block results into the output tensor C. The kernel allocates shared buffers for A, B, and the dequantized B, and a local fragment for accumulation; it runs over K in pipelined stages and expects the provided shapes and dtypes to match the tiling parameters used to build the function. - Parameters are self-descriptive in the signature; notable behaviors: - - B is stored in a compact uint8-packed layout (num_bits per element) and is dequantized using Scale before GEMM. - - The selected dequantization path is controlled by the outer-scope flag `fast_dequant`. - - The GEMM uses transpose_B=True (i.e., multiplies A · B^T after dequantization). - - The function writes results in-place into C. + Parameters are self-descriptive in the signature; notable behaviors: + - B is stored in a compact uint8-packed layout (num_bits per element) and is dequantized using Scale before GEMM. + - The selected dequantization path is controlled by the outer-scope flag `fast_dequant`. + - The GEMM uses transpose_B=True (i.e., multiplies A · B^T after dequantization). + - The function writes results in-place into C. 
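# Illustrative sketch (not part of this patch): the compact uint8 packing assumed for B.
# With num_bits=4 (num_elems_per_byte = 2), element j of a row lives in byte j // 2 and in
# nibble j % 2 (low nibble for even j), which is how the dequantize helpers index
# B_local[i, j // num_elems_per_byte] at position j % num_elems_per_byte.
import torch

codes = torch.randint(0, 16, (2, 8), dtype=torch.uint8)          # unpacked 4-bit codes (K = 8)
qB = codes[:, 0::2] | (codes[:, 1::2] << 4)                      # packed, shape (2, K // 2)
assert torch.equal(qB & 0xF, codes[:, 0::2])                     # even j -> low nibble
assert torch.equal((qB >> 4) & 0xF, codes[:, 1::2])              # odd j  -> high nibble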
""" with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): A_shared = T.alloc_shared(A_shared_shape, in_dtype) @@ -339,16 +344,20 @@ def matmul(M, # May use much more shared memory than necessary Scale_shared = T.alloc_shared((block_N, K // scale_size), storage_dtype) - T.annotate_layout({ - A_shared: tilelang.layout.make_swizzled_layout(A_shared), - B_shared: tilelang.layout.make_swizzled_layout(B_shared), - C_shared: tilelang.layout.make_swizzled_layout(C_shared), - }) + T.annotate_layout( + { + A_shared: tilelang.layout.make_swizzled_layout(A_shared), + B_shared: tilelang.layout.make_swizzled_layout(B_shared), + C_shared: tilelang.layout.make_swizzled_layout(C_shared), + } + ) if with_bias: - T.annotate_layout({ - Bias_shared: tilelang.layout.make_swizzled_layout(Bias_shared), - }) + T.annotate_layout( + { + Bias_shared: tilelang.layout.make_swizzled_layout(Bias_shared), + } + ) if threads == 512: T.disable_warp_group_reg_alloc() @@ -357,26 +366,24 @@ def matmul(M, # T.copy(Bias[by * block_M:(by + 1) * block_M, bx * block_N:(bx + 1) * block_N], # Bias_shared) # T.copy(Bias_shared, C_local) - T.copy(Bias[by * block_M:(by + 1) * block_M, bx * block_N:(bx + 1) * block_N], - C_local) + T.copy(Bias[by * block_M : (by + 1) * block_M, bx * block_N : (bx + 1) * block_N], C_local) else: T.clear(C_local) # Use 1D TMA to load Scale - T.copy(Scale[bx * block_N:(bx + 1) * block_N, :], Scale_shared) + T.copy(Scale[bx * block_N : (bx + 1) * block_N, :], Scale_shared) for k in T.Pipelined(K // block_K, num_stages=num_stages): T.copy(A[by * block_M, k * block_K], A_shared) T.copy(B[bx * block_N, k * block_K // num_elems_per_byte], B_shared) if fast_dequant: - get_fast_dequant_twiddling_func()(B_shared, B_dequantize_shared, Scale_shared, - k) + get_fast_dequant_twiddling_func()(B_shared, B_dequantize_shared, Scale_shared, k) else: get_simple_dequant_func()(B_shared, B_dequantize_shared, Scale_shared, k) T.gemm(A_shared, B_dequantize_shared, C_local, transpose_B=True) T.copy(C_local, C_shared) - T.copy(C_shared, C[by * block_M:(by + 1) * block_M, bx * block_N:(bx + 1) * block_N]) + T.copy(C_shared, C[by * block_M : (by + 1) * block_M, bx * block_N : (bx + 1) * block_N]) return main @@ -395,11 +402,11 @@ def ref_program_twiddling(A, qB, Scale, Bias=None): Returns: torch.Tensor: Resulting matrix C with shape (M, N) in bfloat16. """ - dtypeC = "bfloat16" + dtypeC = T.bfloat16 B = torch_convert_bit_twiddling(qB) for i in range(B.shape[0]): for j in range(B.shape[1]): - B[i][j] = B[i][j] * (2**(Scale[i][j // 32])) + B[i][j] = B[i][j] * (2 ** (Scale[i][j // 32])) C = torch.matmul(A.to(torch.float), B.T.to(torch.float)) C = C.to(torch.__getattribute__(dtypeC)) return C @@ -420,11 +427,11 @@ def ref_program_twiddling_with_bias(A, qB, Scale, Bias): Returns: torch.Tensor: Resulting matrix C with shape (M, N) in bfloat16. """ - dtypeC = "bfloat16" + dtypeC = T.bfloat16 B = torch_convert_bit_twiddling(qB) for i in range(B.shape[0]): for j in range(B.shape[1]): - B[i][j] = B[i][j] * (2**(Scale[i][j // 32])) + B[i][j] = B[i][j] * (2 ** (Scale[i][j // 32])) C = torch.matmul(A.to(torch.float), B.T.to(torch.float)) + Bias C = C.to(torch.__getattribute__(dtypeC)) return C @@ -446,11 +453,11 @@ def ref_program_simple(A, qB, Scale, Bias=None): No in-place modification is performed on inputs (a local floating copy of B is scaled). 
""" - dtypeC = "bfloat16" + dtypeC = T.bfloat16 B = torch_convert(qB) for i in range(B.shape[0]): for j in range(B.shape[1]): - B[i][j] = B[i][j] * (2**(Scale[i][j // 32])) + B[i][j] = B[i][j] * (2 ** (Scale[i][j // 32])) C = torch.matmul(A.to(torch.float), B.T.to(torch.float)) C = C.to(torch.__getattribute__(dtypeC)) return C @@ -476,11 +483,11 @@ def ref_program_simple_with_bias(A, qB, Scale, Bias): No in-place modification is performed on inputs (a local floating copy of B is scaled). """ - dtypeC = "bfloat16" + dtypeC = T.bfloat16 B = torch_convert(qB) for i in range(B.shape[0]): for j in range(B.shape[1]): - B[i][j] = B[i][j] * (2**(Scale[i][j // 32])) + B[i][j] = B[i][j] * (2 ** (Scale[i][j // 32])) C = torch.matmul(A.to(torch.float), B.T.to(torch.float)) + Bias C = C.to(torch.__getattribute__(dtypeC)) return C @@ -507,24 +514,16 @@ def main(m=256, n=256, k=256, scale_size=32, fast_dequant=True, with_bias=False, if tune: kernel = matmul( - m, - n, - k, - "bfloat16", - "bfloat16", - "float32", - num_bits=4, - scale_size=scale_size, - fast_dequant=fast_dequant, - with_bias=with_bias) + m, n, k, T.bfloat16, T.bfloat16, T.float32, num_bits=4, scale_size=scale_size, fast_dequant=fast_dequant, with_bias=with_bias + ) else: kernel = matmul( m, n, k, - "bfloat16", - "bfloat16", - "float32", + T.bfloat16, + T.bfloat16, + T.float32, num_bits=4, scale_size=scale_size, block_M=256, @@ -534,7 +533,8 @@ def main(m=256, n=256, k=256, scale_size=32, fast_dequant=True, with_bias=False, threads=256, split=1, fast_dequant=fast_dequant, - with_bias=with_bias) + with_bias=with_bias, + ) profiler = kernel.get_profiler(tilelang.TensorSupplyType.Auto) diff --git a/examples/dequantize_gemm/example_dequant_gemm_fine_grained.py b/examples/dequantize_gemm/example_dequant_gemm_fine_grained.py index 727d6d3b6ff5790d02bd6afe7832fd69aa76f8b3..37826874bc3dfe2ee980f3b8274d56cca8bed3c6 100644 --- a/examples/dequantize_gemm/example_dequant_gemm_fine_grained.py +++ b/examples/dequantize_gemm/example_dequant_gemm_fine_grained.py @@ -24,8 +24,9 @@ def matmul( num_bits=4, ): from tilelang.quantize import _tir_packed_to_unsigned_convert + num_elems_per_byte = 8 // num_bits - storage_dtype = "int8" + storage_dtype = T.int8 storage_nbit = int("".join(c for c in storage_dtype if c.isdigit())) storage_type = str("".join(c for c in storage_dtype if not c.isdigit())) A_shape = (M, K) @@ -39,9 +40,9 @@ def matmul( @T.prim_func def main( - A: T.Tensor(A_shape, in_dtype), - B: T.Tensor(B_shape, storage_dtype), - C: T.Tensor((M, N), out_dtype), + A: T.Tensor(A_shape, in_dtype), + B: T.Tensor(B_shape, storage_dtype), + C: T.Tensor((M, N), out_dtype), ): with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): A_shared = T.alloc_shared(A_shared_shape, in_dtype) @@ -58,21 +59,19 @@ def matmul( T.copy(A[by * block_M, k * block_K], A_shared) T.copy(B[bx * block_N, k * block_K // num_elems_per_byte], B_shared) - for i in T.serial(block_N * block_K // num_elems_per_byte // - (threads * local_size_compressed)): + for i in T.serial(block_N * block_K // num_elems_per_byte // (threads * local_size_compressed)): for v in T.vectorized(0, local_size_compressed): index = i * threads * local_size_compressed + tx * local_size_compressed + v vi = index // (block_K // num_elems_per_byte) vj = index % (block_K // num_elems_per_byte) B_local[v] = B_shared[vi, vj] for v in T.serial(0, local_size): - B_dequantize_local[v] = _tir_packed_to_unsigned_convert( - storage_type, storage_nbit)( - num_bits, - B_local[v // 
num_elems_per_byte], - v % num_elems_per_byte, - dtype=in_dtype, - ) + B_dequantize_local[v] = _tir_packed_to_unsigned_convert(storage_type, storage_nbit)( + num_bits, + B_local[v // num_elems_per_byte], + v % num_elems_per_byte, + dtype=in_dtype, + ) for v in T.vectorized(0, local_size): index = i * threads * local_size + tx * local_size + v vi = index // block_K @@ -121,9 +120,7 @@ def run_gemm( def ref_program(A, qB): import torch - B = ( - torch.zeros(qB.shape[0], qB.shape[1] * 8 // 4, - dtype=torch.half).to(torch.half).to(A.device)) + B = torch.zeros(qB.shape[0], qB.shape[1] * 8 // 4, dtype=torch.half).to(torch.half).to(A.device) for i in range(B.shape[0]): for j in range(B.shape[1]): B[i][j] = ((qB[i][j // 2] >> (4 * (j % 2))) & 0xF).to(torch.half) @@ -146,25 +143,27 @@ def tl_matmul_with_ladder_weight_only_transform_block_reduce_int4( ): from tilelang.intrinsics.mma_layout import make_mma_swizzle_layout as make_swizzle_layout from tilelang.intrinsics.mma_macro_generator import ( - TensorCoreIntrinEmitterWithLadderTransform,) + TensorCoreIntrinEmitterWithLadderTransform, + ) from bitblas.gpu.intrin.lop3 import decode_i4_to_f16 + assert in_dtype in [ - "float16", - "int8", + T.float16, + T.int8, ], "Currently only float16 and int8 are supported" assert out_dtype in [ - "float16", - "float32", - "int32", + T.float16, + T.float32, + T.int32, ], "Currently only float16, float32 and int32 are supported" num_bits = 4 num_elems_per_byte = 8 // num_bits - storage_dtype = "int8" + storage_dtype = T.int8 micro_size_x = micro_size_y = micro_size_k = 16 - if out_dtype == "int32": + if out_dtype == T.int32: micro_size_k = 32 # This is a debug config @@ -183,7 +182,7 @@ def tl_matmul_with_ladder_weight_only_transform_block_reduce_int4( block_M = block_row_warps * warp_row_tiles block_N = block_col_warps * warp_col_tiles - block_K = 32 if in_dtype == "float16" else 64 + block_K = 32 if in_dtype == T.float16 else 64 chunk = block_K // reduce_k is_smooth_a = False @@ -192,8 +191,7 @@ def tl_matmul_with_ladder_weight_only_transform_block_reduce_int4( pad_factor = 8 A_shape = (M, K) - B_shape = (N // micro_size_y, K // micro_size_k, micro_size_y, - micro_size_k // num_elems_per_byte) + B_shape = (N // micro_size_y, K // micro_size_k, micro_size_y, micro_size_k // num_elems_per_byte) A_shared_shape = (block_M, (block_K + pad_factor) if apply_pad_a else block_K) B_shared_shape = ( block_N // micro_size_y, @@ -228,7 +226,8 @@ def tl_matmul_with_ladder_weight_only_transform_block_reduce_int4( chunk=chunk, reduce_k=reduce_k, transform_kind_b=transform_b, - num_elems_per_byte=num_elems_per_byte) + num_elems_per_byte=num_elems_per_byte, + ) vec_load_qb = 16 if block_N * (block_K // reduce_k) // num_elems_per_byte // threads < vec_load_qb: @@ -236,14 +235,11 @@ def tl_matmul_with_ladder_weight_only_transform_block_reduce_int4( @T.prim_func def main( - A: T.Tensor(A_shape, in_dtype), - B: T.Tensor(B_shape, storage_dtype), - C: T.Tensor((M, N), out_dtype), + A: T.Tensor(A_shape, in_dtype), + B: T.Tensor(B_shape, storage_dtype), + C: T.Tensor((M, N), out_dtype), ): - with T.Kernel( - T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads, - prelude=decode_i4_to_f16) as (bx, by): - + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads, prelude=decode_i4_to_f16) as (bx, by): A_shared = T.alloc_shared(A_shared_shape, in_dtype, scope=shared_scope) B_shared = T.alloc_shared(B_shared_shape, storage_dtype, scope=shared_scope) C_shared = T.alloc_shared(C_shared_shape, out_dtype, 
scope=shared_scope) @@ -255,40 +251,36 @@ def tl_matmul_with_ladder_weight_only_transform_block_reduce_int4( thread_binding = T.get_thread_binding(0) rk = T.get_thread_binding(1) - T.annotate_layout({ - A_shared: make_swizzle_layout(A_shared), - }) + T.annotate_layout( + { + A_shared: make_swizzle_layout(A_shared), + } + ) T.use_swizzle(panel_size=10) T.clear(C_local) for ko in T.Pipelined((K // block_K), num_stages=stage): - # Load A into shared memory for i, k in T.Parallel(block_M, (block_K // reduce_k)): vk = rk * (block_K // reduce_k) + k A_shared[i, vk] = A[by * block_M + i, ko * block_K + vk] # TODO(lei): Layout Inference Pass is not efficient to handle the four dims int8 load - for i in T.serial(block_N * (block_K // reduce_k) // num_elems_per_byte // - (threads * vec_load_qb)): + for i in T.serial(block_N * (block_K // reduce_k) // num_elems_per_byte // (threads * vec_load_qb)): for v in T.vectorized(0, vec_load_qb): t = thread_binding idx = i * threads * vec_load_qb * reduce_k + rk * threads * vec_load_qb + t * vec_load_qb + v vkk = idx % (micro_size_k // num_elems_per_byte) vjj = (idx // (micro_size_k // num_elems_per_byte)) % micro_size_y - vk = (idx // (micro_size_k // num_elems_per_byte) // micro_size_y) % ( - block_K // micro_size_k) - vj = (idx // (micro_size_k // num_elems_per_byte) // micro_size_y // - (block_K // micro_size_k)) % ( - block_N // micro_size_y) - B_shared[vj, vk, vjj, - vkk] = B[bx * (block_N // micro_size_y) + vj, - ko * (block_K // micro_size_k) + vk, vjj, vkk] + vk = (idx // (micro_size_k // num_elems_per_byte) // micro_size_y) % (block_K // micro_size_k) + vj = (idx // (micro_size_k // num_elems_per_byte) // micro_size_y // (block_K // micro_size_k)) % ( + block_N // micro_size_y + ) + B_shared[vj, vk, vjj, vkk] = B[bx * (block_N // micro_size_y) + vj, ko * (block_K // micro_size_k) + vk, vjj, vkk] for ki in T.serial(0, (block_K // (micro_size_k * reduce_k))): - # Load A into fragment mma_emitter.ldmatrix_a( A_local, @@ -307,9 +299,13 @@ def tl_matmul_with_ladder_weight_only_transform_block_reduce_int4( for j in T.serial(warp_cols): local_size_b = mma_emitter.local_size_b - T.call_extern('handle', 'decode_i4u_to_f16', - T.address_of(B_local[j * local_size_b // num_elems_per_byte]), - T.address_of(B_dequantize_local[j * local_size_b]), 8) + T.call_extern( + "handle", + "decode_i4u_to_f16", + T.address_of(B_local[j * local_size_b // num_elems_per_byte]), + T.address_of(B_dequantize_local[j * local_size_b]), + 8, + ) mma_emitter.mma(A_local, B_dequantize_local, C_local) @@ -328,7 +324,8 @@ def tl_matmul_with_ladder_weight_only_transform_block_reduce_int4( reduced_accum_res[0], rk, dtype="handle", - )) + ) + ) if rk == 0: C_local[n] = reduced_accum_res[0] @@ -340,9 +337,9 @@ def tl_matmul_with_ladder_weight_only_transform_block_reduce_int4( for i, j in T.Parallel(block_M, (block_N // reduce_k)): vj = rk * (block_N // reduce_k) + j - C[by * block_M + i, - bx * block_N + vj] = C_shared[i // micro_size_x, vj // micro_size_y, - i % micro_size_x, vj % micro_size_y] + C[by * block_M + i, bx * block_N + vj] = C_shared[ + i // micro_size_x, vj // micro_size_y, i % micro_size_x, vj % micro_size_y + ] return main @@ -357,8 +354,8 @@ def assert_tl_matmul_with_ladder_weight_only_transform_block_reduce_int4_correct transform_b, ): import bitblas - matmul = tl_matmul_with_ladder_weight_only_transform_block_reduce_int4( - M, N, K, in_dtype, out_dtype, accum_dtype, transform_b) + + matmul = tl_matmul_with_ladder_weight_only_transform_block_reduce_int4(M, N, K, in_dtype, 
out_dtype, accum_dtype, transform_b) kernel = tilelang.compile(matmul, out_idx=[2]) src_code = kernel.get_kernel_source() @@ -368,11 +365,10 @@ def assert_tl_matmul_with_ladder_weight_only_transform_block_reduce_int4_correct assert src_code is not None num_bits = 4 num_elems_per_byte = 8 // num_bits - storage_dtype = "int8" + storage_dtype = T.int8 A = torch.rand(M, K, device="cuda", dtype=getattr(torch, in_dtype)) - qB = torch.randint( - 0, 127, (N, K // num_elems_per_byte), device="cuda", dtype=getattr(torch, storage_dtype)) + qB = torch.randint(0, 127, (N, K // num_elems_per_byte), device="cuda", dtype=getattr(torch, storage_dtype)) C = torch.zeros(M, N, device="cuda", dtype=getattr(torch, accum_dtype)) ladder_permutate_config = bitblas.ops.LadderPermutateConfig( @@ -407,9 +403,7 @@ def assert_tl_matmul_with_ladder_weight_only_transform_block_reduce_int4_correct # Ensure that the latency is not None assert latency is not None - B = ( - torch.zeros(qB.shape[0], qB.shape[1] * 8 // 4, - dtype=torch.half).to(torch.half).to(A.device)) + B = torch.zeros(qB.shape[0], qB.shape[1] * 8 // 4, dtype=torch.half).to(torch.half).to(A.device) for i in range(B.shape[0]): for j in range(B.shape[1]): B[i][j] = ((qB[i][j // 2] >> (4 * (j % 2))) & 0xF).to(torch.half) @@ -423,14 +417,13 @@ def assert_tl_matmul_with_ladder_weight_only_transform_block_reduce_int4_correct @tilelang.testing.requires_package("bitblas") def test_run_dequantize_gemm(): - run_gemm(256, 256, 256, "float16", "float16", "float16", 128, 128, 32, num_threads=128) - run_gemm(256, 256, 256, "int8", "int32", "int32", 128, 128, 32, num_threads=128) + run_gemm(256, 256, 256, T.float16, T.float16, T.float16, 128, 128, 32, num_threads=128) + run_gemm(256, 256, 256, T.int8, T.int32, T.int32, 128, 128, 32, num_threads=128) @tilelang.testing.requires_package("bitblas") def test_assert_tl_matmul_with_ladder_weight_only_transform_block_reduce_int4(): - assert_tl_matmul_with_ladder_weight_only_transform_block_reduce_int4_correctness( - 256, 1024, 512, "float16", "float16", "float16", 3) + assert_tl_matmul_with_ladder_weight_only_transform_block_reduce_int4_correctness(256, 1024, 512, T.float16, T.float16, T.float16, 3) def main(): diff --git a/examples/dequantize_gemm/example_dequant_gemm_fp4_hopper.py b/examples/dequantize_gemm/example_dequant_gemm_fp4_hopper.py index c5588d516cffcdd9a29f7cd804d2ab0b5cd79fec..79345771d6bd2e565547016c39e6ee2a4611201d 100644 --- a/examples/dequantize_gemm/example_dequant_gemm_fp4_hopper.py +++ b/examples/dequantize_gemm/example_dequant_gemm_fp4_hopper.py @@ -9,30 +9,29 @@ import argparse def _tir_u8_to_f4_to_f16(nbit: int, val: tir.PrimExpr, pos: tir.PrimExpr, dtype: str): assert nbit == 4 - assert dtype == "float16" - assert val.dtype == "uint8" + assert dtype == T.float16 + assert val.dtype == T.uint8 # e_f4 == 0 -> e_f16 = 0 # e_f4 != 0 -> e_f16 = e_f4 + ExponentialBias(f16, f4) = e_f4 + (2^4 - 2^1) = e_f4 + 14 # s1e2m1 - mask = tir.const((1 << nbit) - 1, "uint16") - f4 = (val >> (pos.astype("uint16") * tir.const(nbit, "uint16"))) & mask - s = f4 >> tir.const(3, "uint16") - e_f4 = (f4 & tir.const(6, "uint16")) >> tir.const(1, "uint16") - e_f16 = e_f4 + tir.const(14, "uint16") - m_f4 = f4 & tir.const(1, "uint16") + mask = tir.const((1 << nbit) - 1, T.uint16) + f4 = (val >> (pos.astype(T.uint16) * tir.const(nbit, T.uint16))) & mask + s = f4 >> tir.const(3, T.uint16) + e_f4 = (f4 & tir.const(6, T.uint16)) >> tir.const(1, T.uint16) + e_f16 = e_f4 + tir.const(14, T.uint16) + m_f4 = f4 & tir.const(1, T.uint16) m_f16 = 
m_f4 - val_f16 = tir.reinterpret("float16", - ((e_f16 | (s << tir.const(5, "uint16"))) << tir.const(10, "uint16") - | m_f16 << tir.const(9, "uint16")).astype("uint16")) - # return tir.Select(e_f4 == tir.const(0, "uint32"), tir.const(0, "float16"), val_f16) + val_f16 = tir.reinterpret( + T.float16, ((e_f16 | (s << tir.const(5, T.uint16))) << tir.const(10, T.uint16) | m_f16 << tir.const(9, T.uint16)).astype(T.uint16) + ) + # return tir.Select(e_f4 == tir.const(0, "uint32"), tir.const(0, T.float16), val_f16) return val_f16 def torch_convert(tensor): - def print_bit(name, val): val_cpu = val.cpu().item() - binary_repr = f'{val_cpu:032b}' + binary_repr = f"{val_cpu:032b}" print(name, binary_repr) def _convert(val, pos): @@ -61,15 +60,15 @@ def torch_convert(tensor): @tilelang.jit(out_idx=[1]) def test_convert(N, K, block_N, block_K, in_dtype, num_bits=4, threads=128): num_elems_per_byte = 8 // num_bits - storage_dtype = "uint8" + storage_dtype = T.uint8 B_shape = (N, K // num_elems_per_byte) B_shared_shape = (block_N, block_K // num_elems_per_byte) B_dequantize_shared_shape = (block_N, block_K) @T.prim_func def main( - B: T.Tensor(B_shape, storage_dtype), - C: T.Tensor((N, K), in_dtype), + B: T.Tensor(B_shape, storage_dtype), + C: T.Tensor((N, K), in_dtype), ): with T.Kernel(T.ceildiv(N, block_N), threads=threads) as (bx): B_shared = T.alloc_shared(B_shared_shape, storage_dtype) @@ -99,7 +98,7 @@ def test_fp4_fp16_convert_close(): K, block_N, block_K, - "float16", + T.float16, ) B = torch.randint(0, 16, (N, K // 2), dtype=torch.uint8, device="cuda").to(torch.uint8) @@ -118,23 +117,15 @@ def get_configs(): splits = [1] _configs = list(itertools.product(block_M, block_N, block_K, num_stages, threads, splits)) - configs = [{ - 'block_M': c[0], - 'block_N': c[1], - 'block_K': c[2], - 'num_stages': c[3], - 'threads': c[4], - 'split': c[5] - } for c in _configs] + configs = [{"block_M": c[0], "block_N": c[1], "block_K": c[2], "num_stages": c[3], "threads": c[4], "split": c[5]} for c in _configs] return configs def matmul(M, N, K, in_dtype, out_dtype, accum_dtype, num_bits=4, tune=False): - @tilelang.jit(out_idx=[2]) def kernel_func(block_M, block_N, block_K, num_stages, threads, split=1): num_elems_per_byte = 8 // num_bits - storage_dtype = "uint8" + storage_dtype = T.uint8 A_shape = (M, K) B_shape = (N, K // num_elems_per_byte) A_shared_shape = (block_M, block_K) @@ -145,17 +136,12 @@ def matmul(M, N, K, in_dtype, out_dtype, accum_dtype, num_bits=4, tune=False): @T.prim_func def main_split( - A: T.Tensor(A_shape, in_dtype), - B: T.Tensor(B_shape, storage_dtype), - Ct: T.Tensor((N, M), out_dtype), + A: T.Tensor(A_shape, in_dtype), + B: T.Tensor(B_shape, storage_dtype), + Ct: T.Tensor((N, M), out_dtype), ): - SplitC = T.alloc_buffer([ - split, (N + block_N - 1) // block_N * block_N, - (M + block_M - 1) // block_M * block_M - ], out_dtype) - with T.Kernel( - T.ceildiv(N, block_N), T.ceildiv(M, block_M), split, - threads=threads) as (bx, by, bz): + SplitC = T.alloc_buffer([split, (N + block_N - 1) // block_N * block_N, (M + block_M - 1) // block_M * block_M], out_dtype) + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), split, threads=threads) as (bx, by, bz): A_shared = T.alloc_shared(A_shared_shape, in_dtype) B_shared = T.alloc_shared(B_shared_shape, storage_dtype) B_local = T.alloc_fragment(B_shared_shape, storage_dtype) @@ -164,10 +150,12 @@ def matmul(M, N, K, in_dtype, out_dtype, accum_dtype, num_bits=4, tune=False): Ct_local = T.alloc_fragment((block_N, block_M), accum_dtype) Ct_shared 
= T.alloc_shared((block_N, block_M), out_dtype) - T.annotate_layout({ - B_shared: tilelang.layout.make_swizzled_layout(B_shared), - Ct_shared: tilelang.layout.make_swizzled_layout(Ct_shared), - }) + T.annotate_layout( + { + B_shared: tilelang.layout.make_swizzled_layout(B_shared), + Ct_shared: tilelang.layout.make_swizzled_layout(Ct_shared), + } + ) T.clear(Ct_local) for k in T.Pipelined(K // (block_K * split), num_stages=num_stages): @@ -183,8 +171,7 @@ def matmul(M, N, K, in_dtype, out_dtype, accum_dtype, num_bits=4, tune=False): ) T.copy(B_dequantize_local, B_dequantize_prev_local) T.gemm(B_dequantize_prev_local, A_shared, Ct_local, transpose_B=True) - T.copy(Ct_local, SplitC[bz, bx * block_N:(bx + 1) * block_N, - by * block_M:(by + 1) * block_M]) + T.copy(Ct_local, SplitC[bz, bx * block_N : (bx + 1) * block_N, by * block_M : (by + 1) * block_M]) with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M)) as (bx, by): acc = T.alloc_fragment((block_N, block_M), out_dtype) T.clear(acc) @@ -195,12 +182,11 @@ def matmul(M, N, K, in_dtype, out_dtype, accum_dtype, num_bits=4, tune=False): @T.prim_func def main( - A: T.Tensor(A_shape, in_dtype), - B: T.Tensor(B_shape, storage_dtype), - Ct: T.Tensor((N, M), out_dtype), + A: T.Tensor(A_shape, in_dtype), + B: T.Tensor(B_shape, storage_dtype), + Ct: T.Tensor((N, M), out_dtype), ): - with T.Kernel( - T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): A_shared = T.alloc_shared(A_shared_shape, in_dtype) B_shared = T.alloc_shared(B_shared_shape, storage_dtype) B_local = T.alloc_fragment(B_shared_shape, storage_dtype) @@ -209,10 +195,12 @@ def matmul(M, N, K, in_dtype, out_dtype, accum_dtype, num_bits=4, tune=False): Ct_local = T.alloc_fragment((block_N, block_M), accum_dtype) Ct_shared = T.alloc_shared((block_N, block_M), out_dtype) - T.annotate_layout({ - B_shared: tilelang.layout.make_swizzled_layout(B_shared), - Ct_shared: tilelang.layout.make_swizzled_layout(Ct_shared), - }) + T.annotate_layout( + { + B_shared: tilelang.layout.make_swizzled_layout(B_shared), + Ct_shared: tilelang.layout.make_swizzled_layout(Ct_shared), + } + ) T.clear(Ct_local) for k in T.Pipelined(K // block_K, num_stages=num_stages): @@ -229,8 +217,7 @@ def matmul(M, N, K, in_dtype, out_dtype, accum_dtype, num_bits=4, tune=False): T.copy(B_dequantize_local, B_dequantize_prev_local) T.gemm(B_dequantize_prev_local, A_shared, Ct_local, transpose_B=True) T.copy(Ct_local, Ct_shared) - T.copy(Ct_shared, Ct[bx * block_N:(bx + 1) * block_N, - by * block_M:(by + 1) * block_M]) + T.copy(Ct_shared, Ct[bx * block_N : (bx + 1) * block_N, by * block_M : (by + 1) * block_M]) if split == 1: return main @@ -241,12 +228,7 @@ def matmul(M, N, K, in_dtype, out_dtype, accum_dtype, num_bits=4, tune=False): @autotune(configs=get_configs(), warmup=10, rep=10) @tilelang.jit(out_idx=[2]) - def kernel(block_M=None, - block_N=None, - block_K=None, - num_stages=None, - threads=None, - split=None): + def kernel(block_M=None, block_N=None, block_K=None, num_stages=None, threads=None, split=None): return kernel_func(block_M, block_N, block_K, num_stages, threads, split).prim_func return kernel() @@ -259,7 +241,7 @@ def matmul(M, N, K, in_dtype, out_dtype, accum_dtype, num_bits=4, tune=False): def ref_program(A, qB): - dtypeC = "float16" + dtypeC = T.float16 B = torch_convert(qB) C = torch.matmul(A.to(torch.float), B.T.to(torch.float)) C = C.to(torch.__getattribute__(dtypeC)) @@ -269,10 +251,10 
@@ def ref_program(A, qB): def main(m=256, n=256, k=256, tune=False): total_flops = 2 * m * n * k - if (not tune): - kernel = matmul( - m, n, k, "float16", "float16", "float32", num_bits=4, tune=tune)( - block_M=128, block_N=128, block_K=128, num_stages=2, threads=256, split=1) + if not tune: + kernel = matmul(m, n, k, T.float16, T.float16, T.float32, num_bits=4, tune=tune)( + block_M=128, block_N=128, block_K=128, num_stages=2, threads=256, split=1 + ) profiler = kernel.get_profiler(tilelang.TensorSupplyType.Integer) profiler.assert_allclose(ref_program, rtol=0.01, atol=0.01) print("All checks pass.") @@ -283,7 +265,7 @@ def main(m=256, n=256, k=256, tune=False): print("Tile-lang: {:.2f} ms".format(latency)) print("Tile-lang: {:.2f} TFlops".format(total_flops / latency * 1e-9)) else: - best_result = matmul(m, n, k, "float16", "float16", "float32", num_bits=4, tune=tune) + best_result = matmul(m, n, k, T.float16, T.float16, T.float32, num_bits=4, tune=tune) best_latency = best_result.latency best_config = best_result.config print(f"Best latency: {best_latency}") @@ -293,10 +275,10 @@ def main(m=256, n=256, k=256, tune=False): if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--m', type=int, default=256, help='M') - parser.add_argument('--n', type=int, default=256, help='N') - parser.add_argument('--k', type=int, default=256, help='K') - parser.add_argument('--tune', action='store_true', help='tune configs') + parser.add_argument("--m", type=int, default=256, help="M") + parser.add_argument("--n", type=int, default=256, help="N") + parser.add_argument("--k", type=int, default=256, help="K") + parser.add_argument("--tune", action="store_true", help="tune configs") args = parser.parse_args() M, N, K = args.m, args.n, args.k main(M, N, K, args.tune) diff --git a/examples/dequantize_gemm/example_dequant_gemm_w4a8.py b/examples/dequantize_gemm/example_dequant_gemm_w4a8.py index 52ee8216f51208b98335416f20c0782b7a5c3f2d..61baa668e6eb0853c4a6c2d93a5e05dc3f254e46 100644 --- a/examples/dequantize_gemm/example_dequant_gemm_w4a8.py +++ b/examples/dequantize_gemm/example_dequant_gemm_w4a8.py @@ -9,15 +9,15 @@ import argparse def _tir_u8_to_i4_to_i8(nbit: int, val: tir.PrimExpr, pos: tir.PrimExpr, dtype: str): assert nbit == 4 - assert dtype == "int8" - assert val.dtype == "uint8" + assert dtype == T.int8 + assert val.dtype == T.uint8 - mask = tir.const((1 << nbit) - 1, "uint8") + mask = tir.const((1 << nbit) - 1, T.uint8) - i4 = (val >> (pos.astype("uint8") * tir.const(nbit, "uint8"))) & mask + i4 = (val >> (pos.astype(T.uint8) * tir.const(nbit, T.uint8))) & mask - i8_shifted = tir.reinterpret("int8", i4 << tir.const(4, "uint8")) - i8 = i8_shifted >> tir.const(4, "int8") + i8_shifted = tir.reinterpret(T.int8, i4 << tir.const(4, T.uint8)) + i8 = i8_shifted >> tir.const(4, T.int8) return i8 @@ -35,15 +35,15 @@ def get_configs(): @tilelang.jit(out_idx=[1]) def _convert_test(N, K, block_N, block_K, in_dtype, num_bits=4, threads=128): num_elems_per_byte = 8 // num_bits - storage_dtype = "uint8" + storage_dtype = T.uint8 B_shape = (N, K // num_elems_per_byte) B_shared_shape = (block_N, block_K // num_elems_per_byte) B_dequantize_shared_shape = (block_N, block_K) @T.prim_func def main( - B: T.Tensor(B_shape, storage_dtype), - C: T.Tensor((N, K), in_dtype), + B: T.Tensor(B_shape, storage_dtype), + C: T.Tensor((N, K), in_dtype), ): with T.Kernel(T.ceildiv(N, block_N), threads=threads) as (bx): B_shared = T.alloc_shared(B_shared_shape, storage_dtype) @@ -66,13 +66,12 @@ def 
_convert_test(N, K, block_N, block_K, in_dtype, num_bits=4, threads=128): def torch_convert(tensor): - def _convert(val, pos): assert val.dtype == torch.uint8 val = val.view(torch.int8) mask = (1 << 4) - 1 - i4_shifted = ((val >> (pos * 4)) & mask) - i4 = ((i4_shifted << 4) >> 4) + i4_shifted = (val >> (pos * 4)) & mask + i4 = (i4_shifted << 4) >> 4 return i4.view(torch.int8) @@ -86,7 +85,7 @@ def torch_convert(tensor): def ref_program(A, qB): - dtypeC = "int32" + dtypeC = T.int32 B = torch_convert(qB) C = torch.matmul(A.to(torch.float), B.T.to(torch.float)) C = C.to(torch.__getattribute__(dtypeC)) @@ -94,11 +93,10 @@ def ref_program(A, qB): def matmul_int8xint4(M, N, K, in_dtype, out_dtype, accum_dtype, num_bits=4, tune=False): - @tilelang.jit(out_idx=[2]) def kernel_func(block_M, block_N, block_K, num_stages, threads): num_elems_per_byte = 8 // num_bits - storage_dtype = "uint8" + storage_dtype = T.uint8 A_shape = (M, K) B_shape = (N, K // num_elems_per_byte) A_shared_shape = (block_M, block_K) @@ -109,12 +107,11 @@ def matmul_int8xint4(M, N, K, in_dtype, out_dtype, accum_dtype, num_bits=4, tune @T.prim_func def main( - A: T.Tensor(A_shape, in_dtype), - B: T.Tensor(B_shape, storage_dtype), - Ct: T.Tensor((N, M), out_dtype), + A: T.Tensor(A_shape, in_dtype), + B: T.Tensor(B_shape, storage_dtype), + Ct: T.Tensor((N, M), out_dtype), ): - with T.Kernel( - T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): A_shared = T.alloc_shared(A_shared_shape, in_dtype) B_shared = T.alloc_shared(B_shared_shape, storage_dtype) B_local = T.alloc_fragment(B_shared_shape, storage_dtype) @@ -123,10 +120,12 @@ def matmul_int8xint4(M, N, K, in_dtype, out_dtype, accum_dtype, num_bits=4, tune Ct_local = T.alloc_fragment((block_N, block_M), accum_dtype) Ct_shared = T.alloc_shared((block_N, block_M), out_dtype) - T.annotate_layout({ - B_shared: tilelang.layout.make_swizzled_layout(B_shared), - Ct_shared: tilelang.layout.make_swizzled_layout(Ct_shared), - }) + T.annotate_layout( + { + B_shared: tilelang.layout.make_swizzled_layout(B_shared), + Ct_shared: tilelang.layout.make_swizzled_layout(Ct_shared), + } + ) T.clear(Ct_local) for k in T.Pipelined(K // block_K, num_stages=num_stages): @@ -143,8 +142,7 @@ def matmul_int8xint4(M, N, K, in_dtype, out_dtype, accum_dtype, num_bits=4, tune T.copy(B_dequantize_local, B_dequantize_prev_local) T.gemm(B_dequantize_prev_local, A_shared, Ct_local, transpose_B=True) T.copy(Ct_local, Ct_shared) - T.copy(Ct_shared, Ct[bx * block_N:(bx + 1) * block_N, - by * block_M:(by + 1) * block_M]) + T.copy(Ct_shared, Ct[bx * block_N : (bx + 1) * block_N, by * block_M : (by + 1) * block_M]) return main @@ -167,10 +165,10 @@ def matmul_int8xint4(M, N, K, in_dtype, out_dtype, accum_dtype, num_bits=4, tune def main(m=128, n=256, k=256, tune=False): total_flops = 2 * m * n * k - if (not tune): - kernel = matmul_int8xint4( - m, n, k, "int8", "int32", "int32", num_bits=4, tune=tune)( - block_M=32, block_N=32, block_K=128, num_stages=1, threads=128) + if not tune: + kernel = matmul_int8xint4(m, n, k, T.int8, T.int32, T.int32, num_bits=4, tune=tune)( + block_M=32, block_N=32, block_K=128, num_stages=1, threads=128 + ) profiler = kernel.get_profiler() profiler.assert_allclose(ref_program, rtol=1e-2, atol=1e-2) print("All checks pass.") @@ -179,7 +177,7 @@ def main(m=128, n=256, k=256, tune=False): print(f"Tilelang: {latency} ms") else: - best_result = matmul_int8xint4(m, n, k, "int8", 
"int32", "int32", num_bits=4, tune=tune) + best_result = matmul_int8xint4(m, n, k, T.int8, T.int32, T.int32, num_bits=4, tune=tune) best_latency = best_result.latency best_config = best_result.config print(f"Bset latency: {best_latency}") diff --git a/examples/dequantize_gemm/example_dequant_gemv_fp16xint4.py b/examples/dequantize_gemm/example_dequant_gemv_fp16xint4.py index d3e90ec9323052ee631f693e1f8228cc1f8695b7..dea2e5ddd8a1e763cac46c1e07199b5c6077830c 100644 --- a/examples/dequantize_gemm/example_dequant_gemv_fp16xint4.py +++ b/examples/dequantize_gemm/example_dequant_gemv_fp16xint4.py @@ -4,7 +4,8 @@ from typing import Optional, Callable, Any import torch from tilelang import DataType from tilelang.quantize import ( - _tir_packed_int_to_int_convert,) + _tir_packed_int_to_int_convert, +) @tilelang.jit @@ -16,7 +17,7 @@ def dequantize_gemv( out_dtype: str, accum_dtype: str, num_bits: int = 4, - storage_dtype: str = "int8", + storage_dtype: T.dtype = T.int8, source_format: str = "uint", n_partition: int = 4, reduce_thread: int = 32, @@ -26,11 +27,10 @@ def dequantize_gemv( group_size: int = -1, with_scaling: bool = False, ) -> Callable[..., Any]: - assert n_partition is not None, "n_partition must be provided" assert reduce_thread is not None, ( - "reduce_thread must be provided currently, as related bitblas.gpu.gemv.GEMV" - "sch_outer_reduction_with_config is not implemented") + "reduce_thread must be provided currently, as related bitblas.gpu.gemv.GEMVsch_outer_reduction_with_config is not implemented" + ) assert trans_A is False, "Dequantize only implement for trans_A=False currently" assert trans_B is True, "Dequantize only implement for trans_B=TRue currently" @@ -51,7 +51,7 @@ def dequantize_gemv( C_shape = (M, N) dp4a_size = 4 - use_dp4a = in_dtype == "int8" and accum_dtype == "int32" + use_dp4a = in_dtype == T.int8 and accum_dtype == T.int32 import_source: Optional[str] = None func_name: str = "" @@ -81,12 +81,12 @@ def dequantize_gemv( C: T.Tensor[C_shape, out_dtype], ): with T.Kernel( - T.ceildiv(N, n_partition), - M, - threads=(reduce_thread, n_partition), + T.ceildiv(N, n_partition), + M, + threads=(reduce_thread, n_partition), ) as ( - bx, - by, + bx, + by, ): A_local = T.alloc_local((micro_size_k,), in_dtype) B_quant_local = T.alloc_local([micro_size_k_compressed], storage_dtype) @@ -107,8 +107,7 @@ def dequantize_gemv( for v in T.vectorized(micro_size_k_compressed): B_quant_local[v] = B[ bx * n_partition + ni, - ko * (reduce_thread * micro_size_k_compressed) + - kr * micro_size_k_compressed + v, + ko * (reduce_thread * micro_size_k_compressed) + kr * micro_size_k_compressed + v, ] if fast_decoding: @@ -120,10 +119,9 @@ def dequantize_gemv( ) else: for ki in T.serial(micro_size_k): - B_dequantize_local[ki] = _tir_packed_int_to_int_convert( - storage_type, - storage_nbit)(num_bits, B_quant_local[ki // num_elems_per_byte], - ki % num_elems_per_byte, in_dtype) + B_dequantize_local[ki] = _tir_packed_int_to_int_convert(storage_type, storage_nbit)( + num_bits, B_quant_local[ki // num_elems_per_byte], ki % num_elems_per_byte, in_dtype + ) if use_dp4a: for ki in T.serial(micro_size_k // dp4a_size): @@ -137,9 +135,9 @@ def dequantize_gemv( accum_res[0] += A_local[ki] * B_dequantize_local[ki] with T.attr( - T.comm_reducer(lambda x, y: x + y, [T.Cast(accum_dtype, 0)]), - "reduce_scope", - T.reinterpret(T.uint64(0), dtype="handle"), + T.comm_reducer(lambda x, y: x + y, [T.Cast(accum_dtype, 0)]), + "reduce_scope", + T.reinterpret(T.uint64(0), dtype="handle"), ): T.evaluate( 
T.tvm_thread_allreduce( @@ -149,7 +147,8 @@ def dequantize_gemv( reduced_accum_res[0], kr, dtype="handle", - )) + ) + ) if kr == 0: C[by, bx * n_partition + ni] = reduced_accum_res[0] @@ -160,11 +159,11 @@ def main() -> None: M = 1 N = 1024 K = 1024 - in_dtype = "float16" - out_dtype = "float16" - accum_dtype = "float16" + in_dtype = T.float16 + out_dtype = T.float16 + accum_dtype = T.float16 num_bits = 4 - storage_dtype = "int8" + storage_dtype = T.int8 source_format = "uint" n_partition = 4 reduce_thread = 32 @@ -174,26 +173,39 @@ def main() -> None: group_size = -1 with_scaling = False - kernel = dequantize_gemv(M, N, K, in_dtype, out_dtype, accum_dtype, num_bits, storage_dtype, - source_format, n_partition, reduce_thread, fast_decoding, trans_A, - trans_B, group_size, with_scaling) + kernel = dequantize_gemv( + M, + N, + K, + in_dtype, + out_dtype, + accum_dtype, + num_bits, + storage_dtype, + source_format, + n_partition, + reduce_thread, + fast_decoding, + trans_A, + trans_B, + group_size, + with_scaling, + ) storage_nbit = int("".join(c for c in storage_dtype if c.isdigit())) num_elems_per_byte = storage_nbit // num_bits A = torch.rand(M, K, dtype=getattr(torch, in_dtype)).cuda() - qB = torch.randint( - 0, 127, (N, K // num_elems_per_byte), dtype=getattr(torch, storage_dtype)).cuda() + qB = torch.randint(0, 127, (N, K // num_elems_per_byte), dtype=getattr(torch, storage_dtype)).cuda() C = torch.zeros(M, N, dtype=getattr(torch, accum_dtype)).cuda() if fast_decoding: from tilelang.quantize.utils import interleave_weight + qB = interleave_weight(qB, num_bits, in_dtype) kernel(A, qB, C) # int4 reference - B = ( - torch.zeros(qB.shape[0], qB.shape[1] * 8 // 4, - dtype=torch.half).to(torch.half).to(A.device)) + B = torch.zeros(qB.shape[0], qB.shape[1] * 8 // 4, dtype=torch.half).to(torch.half).to(A.device) for j in range(B.shape[1]): B[:, j] = ((qB[:, j // 2] >> (4 * (j % 2))) & 0xF).to(torch.half) diff --git a/examples/dequantize_gemm/example_dequant_groupedgemm_bf16_mxfp4_hopper.py b/examples/dequantize_gemm/example_dequant_groupedgemm_bf16_mxfp4_hopper.py index c4cf5fb505ddbb31ec7a655bab3d73897a99eaa4..9921c6bfe2dcc9265fe4875d21835d93baef6b90 100644 --- a/examples/dequantize_gemm/example_dequant_groupedgemm_bf16_mxfp4_hopper.py +++ b/examples/dequantize_gemm/example_dequant_groupedgemm_bf16_mxfp4_hopper.py @@ -25,6 +25,7 @@ def get_configs(): List[dict]: A list of configuration dictionaries covering all combinations. 
""" import itertools + iter_params = dict( block_M=[128], block_N=[64, 128, 256], @@ -33,33 +34,33 @@ def get_configs(): threads=[128, 256, 512], split=[1], ) - return [{ - k: v for k, v in zip(iter_params, values) - } for values in itertools.product(*iter_params.values())] + return [{k: v for k, v in zip(iter_params, values)} for values in itertools.product(*iter_params.values())] @tilelang.autotune(configs=get_configs()) @tilelang.jit(out_idx=[-1]) -def matmul(M, - N, - K, - topk, - E, - padding_M, - in_dtype, - out_dtype, - accum_dtype, - source_format='uint', - num_bits=4, - scale_size=32, - fast_dequant=True, - with_bias=False, - block_M=128, - block_N=256, - block_K=128, - num_stages=2, - threads=256, - split=1): +def matmul( + M, + N, + K, + topk, + E, + padding_M, + in_dtype, + out_dtype, + accum_dtype, + source_format=T.uint32, + num_bits=4, + scale_size=32, + fast_dequant=True, + with_bias=False, + block_M=128, + block_N=256, + block_K=128, + num_stages=2, + threads=256, + split=1, +): """ Construct and return a grouped (Mixture-of-Experts) matrix-multiply TIR kernel that multiplies A (shape MxK) by a quantized, expert-grouped B (shape ExNxQK) and writes an output of shape (M, topk, N) in out_dtype. @@ -82,8 +83,8 @@ def matmul(M, topk (int): number of experts selected per token. E (int): number of experts. padding_M (int): padded number of tokens after grouping and block alignment. - in_dtype (str): element type of A (e.g., "bfloat16"). - out_dtype (str): output tensor element type (e.g., "bfloat16"). + in_dtype (str): element type of A (e.g., T.bfloat16). + out_dtype (str): output tensor element type (e.g., T.bfloat16). accum_dtype (str): accumulation type used for the inner GEMM. source_format (str, optional): format string passed to intrinsic selector (default "uint"). num_bits (int, optional): number of bits per quantized element in B (default 4). @@ -110,16 +111,17 @@ def matmul(M, """ num_elems_per_byte = 8 // num_bits - storage_dtype = "uint8" + storage_dtype = T.uint8 QK = K // num_elems_per_byte Block_QK = block_K // num_elems_per_byte A_shared_shape = (block_M, block_K) B_shared_shape = (block_N, Block_QK) - Bias_shared_shape = (block_N) + Bias_shared_shape = block_N B_dequantize_shared_shape = (block_N, block_K) assert K % (block_K * split) == 0 from tilelang.quantize import get_mxfp_intrin_group + # fast_dequant_bf16_fp4_twiddling mxfp_intrin_info = get_mxfp_intrin_group( out_dtype=in_dtype, @@ -135,7 +137,7 @@ def matmul(M, import_source = import_source # the dequant part is the same as in dequant_gemm - def get_fast_dequant_twiddling_func(in_dtype="fp4", out_dtype="bfloat16"): + def get_fast_dequant_twiddling_func(in_dtype="fp4", out_dtype=T.bfloat16): """ Return a TileLang macro that performs fast dequantization of twiddled FP4-packed data into BF16. The returned macro has signature (B_shared, B_dequantize_shared, Scale, k) and: @@ -145,12 +147,12 @@ def matmul(M, - Writes the scaled BF16 results into B_dequantize_shared. Notes: - - This factory only supports in_dtype="fp4" and out_dtype="bfloat16". + - This factory only supports in_dtype="fp4" and out_dtype=T.bfloat16. - The macro depends on several names from the enclosing scope (e.g., import_source, func_name, DataType, num_elems_per_byte, storage_dtype, block_N, block_K, threads, scale_size); those must be defined and consistent with the kernel that will use the macro. 
- The macro issues a T.import_source and T.call_extern to invoke the external intrinsic; ensure the external implementation matching `func_name` is available at compilation/runtime. """ assert in_dtype in ["fp4"] - assert out_dtype in ["bfloat16"] + assert out_dtype in [T.bfloat16] # Some variables for dequantization in each thread MAX_TRANSACTION_SIZE_BITS = 128 @@ -221,19 +223,16 @@ def matmul(M, for v in T.vectorized(0, local_size): index = i * threads * local_size + tx * local_size + v - B_dequantize_shared[index // block_K, - index % block_K] = B_dequantize_local_thread[v] + B_dequantize_shared[index // block_K, index % block_K] = B_dequantize_local_thread[v] return fast_dequant_bf16_fp4_twiddling - def get_simple_dequant_func(in_dtype="fp4", out_dtype="bfloat16"): - + def get_simple_dequant_func(in_dtype="fp4", out_dtype=T.bfloat16): assert in_dtype in ["fp4"] - assert out_dtype in ["bfloat16"] + assert out_dtype in [T.bfloat16] @T.macro def simple_dequant_bf16_fp4(B_shared, B_dequantize_shared, Scale_shared, k): - B_local = T.alloc_fragment(B_shared_shape, storage_dtype) B_dequantize_local = T.alloc_fragment(B_dequantize_shared_shape, out_dtype) @@ -244,8 +243,8 @@ def matmul(M, B_local[i, j // num_elems_per_byte], j % num_elems_per_byte, Scale_shared[ - i, k * block_K // scale_size + j // - scale_size], # Scale is the exponential part, within the representation of uint8 + i, k * block_K // scale_size + j // scale_size + ], # Scale is the exponential part, within the representation of uint8 dtype=out_dtype, ) * T.shift_left(1, (Scale_shared[i, k * block_K // scale_size + j // scale_size])) T.copy(B_dequantize_local, B_dequantize_shared) @@ -254,19 +253,17 @@ def matmul(M, @T.prim_func def main( - A: T.Tensor((M, K), in_dtype), - B: T.Tensor((E, N, QK), storage_dtype), - Scale: T.Tensor((E, N, K // scale_size), storage_dtype), - Bias: T.Tensor((E, N), out_dtype), - # Add fusedmoe tensors - topk_weights: T.Tensor((M * topk), out_dtype), - sorted_token_ids: T.Tensor((padding_M), "int32"), - expert_ids: T.Tensor((padding_M // block_M), "int32"), - C: T.Tensor((M, topk, N), out_dtype), + A: T.Tensor((M, K), in_dtype), + B: T.Tensor((E, N, QK), storage_dtype), + Scale: T.Tensor((E, N, K // scale_size), storage_dtype), + Bias: T.Tensor((E, N), out_dtype), + # Add fusedmoe tensors + topk_weights: T.Tensor((M * topk), out_dtype), + sorted_token_ids: T.Tensor((padding_M), T.int32), + expert_ids: T.Tensor((padding_M // block_M), T.int32), + C: T.Tensor((M, topk, N), out_dtype), ): - - with T.Kernel( - T.ceildiv(N, block_N), T.ceildiv(padding_M, block_M), threads=threads) as (bx, by): + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(padding_M, block_M), threads=threads) as (bx, by): A_shared = T.alloc_shared(A_shared_shape, in_dtype) B_shared = T.alloc_shared(B_shared_shape, storage_dtype) B_dequantize_shared = T.alloc_shared(B_dequantize_shared_shape, in_dtype) @@ -274,23 +271,25 @@ def matmul(M, C_local = T.alloc_fragment((block_M, block_N), accum_dtype) C_shared = T.alloc_shared((block_M, block_N), out_dtype) topk_weights_shared = T.alloc_shared((block_M), out_dtype) - sorted_token_ids_shared = T.alloc_shared((block_M), "int32") - expert_id = T.alloc_local((1), "int32") # the expert id for the current block + sorted_token_ids_shared = T.alloc_shared((block_M), T.int32) + expert_id = T.alloc_local((1), T.int32) # the expert id for the current block # To use 1D TMA, the last dim of Scale_shared must have stride=1 # May use much more shared memory than necessary Scale_shared = 
T.alloc_shared((block_N, K // scale_size), storage_dtype) - T.annotate_layout({ - A_shared: tilelang.layout.make_swizzled_layout(A_shared), - B_shared: tilelang.layout.make_swizzled_layout(B_shared), - C_shared: tilelang.layout.make_swizzled_layout(C_shared), - }) + T.annotate_layout( + { + A_shared: tilelang.layout.make_swizzled_layout(A_shared), + B_shared: tilelang.layout.make_swizzled_layout(B_shared), + C_shared: tilelang.layout.make_swizzled_layout(C_shared), + } + ) T.use_swizzle(10) if threads == 512: T.disable_warp_group_reg_alloc() - T.copy(sorted_token_ids[by * block_M:(by + 1) * block_M], sorted_token_ids_shared) + T.copy(sorted_token_ids[by * block_M : (by + 1) * block_M], sorted_token_ids_shared) expert_id[0] = expert_ids[by] # Get the topk weights of each token in the current block @@ -300,11 +299,11 @@ def matmul(M, # Get bias and scale based on the expert id if with_bias: - T.copy(Bias[expert_id[0], bx * block_N:(bx + 1) * block_N], Bias_shared) + T.copy(Bias[expert_id[0], bx * block_N : (bx + 1) * block_N], Bias_shared) else: T.clear(Bias_shared) - T.copy(Scale[expert_id[0], bx * block_N:(bx + 1) * block_N, :], Scale_shared) + T.copy(Scale[expert_id[0], bx * block_N : (bx + 1) * block_N, :], Scale_shared) for i, j in T.Parallel(block_M, block_N): C_local[i, j] = Bias_shared[j] @@ -317,14 +316,13 @@ def matmul(M, base = copy_i * threads * 16 + tx * 16 if sorted_token_ids_shared[base // block_K] != -1: for copy_j in T.vectorized(16): - A_shared[base // block_K, base % block_K + - copy_j] = A[sorted_token_ids_shared[base // block_K] // topk, - k * block_K + base % block_K + copy_j] + A_shared[base // block_K, base % block_K + copy_j] = A[ + sorted_token_ids_shared[base // block_K] // topk, k * block_K + base % block_K + copy_j + ] T.copy(B[expert_id[0], bx * block_N, k * block_K // num_elems_per_byte], B_shared) if fast_dequant: - get_fast_dequant_twiddling_func()(B_shared, B_dequantize_shared, Scale_shared, - k) + get_fast_dequant_twiddling_func()(B_shared, B_dequantize_shared, Scale_shared, k) else: get_simple_dequant_func()(B_shared, B_dequantize_shared, Scale_shared, k) @@ -338,16 +336,17 @@ def matmul(M, base = copy_i * threads * 16 + tx * 16 if sorted_token_ids_shared[base // block_N] != -1: for copy_j in T.vectorized(16): - C[sorted_token_ids_shared[base // block_N] // topk, - sorted_token_ids_shared[base // block_N] % topk, bx * block_N + - base % block_N + copy_j] = C_shared[base // block_N, - base % block_N + copy_j] + C[ + sorted_token_ids_shared[base // block_N] // topk, + sorted_token_ids_shared[base // block_N] % topk, + bx * block_N + base % block_N + copy_j, + ] = C_shared[base // block_N, base % block_N + copy_j] return main def ref_moe(A, qB, Scale, Bias, topk_weights, sorted_token_ids, expert_ids, block_M=256): - dtypeC = "bfloat16" + dtypeC = T.bfloat16 M, K = A.shape E, N, QK = qB.shape topk = topk_weights.shape[0] // M @@ -355,7 +354,7 @@ def ref_moe(A, qB, Scale, Bias, topk_weights, sorted_token_ids, expert_ids, bloc assert scale_size == 32 # MXFP4 # Initialize output tensor - C = torch.ones((M, topk, N), dtype=getattr(torch, dtypeC), device='cuda') + C = torch.ones((M, topk, N), dtype=getattr(torch, dtypeC), device="cuda") # Iterate over sorted_token_ids for idx in range(len(sorted_token_ids)): # padding_M @@ -370,14 +369,11 @@ def ref_moe(A, qB, Scale, Bias, topk_weights, sorted_token_ids, expert_ids, bloc # Dequantize the expert weights B = torch_convert_bit_twiddling(qB[expert_id]) # shape: (N, K) - B *= 2**( - Scale[expert_id][:, 
(torch.arange(B.shape[1], device=B.device) // scale_size)].to( - torch.bfloat16)) + B *= 2 ** (Scale[expert_id][:, (torch.arange(B.shape[1], device=B.device) // scale_size)].to(torch.bfloat16)) # Compute the output for this token-expert pair # token_embedding @ B.T + bias - output = torch.matmul(token_embedding.to(torch.bfloat16), B.T.to( - torch.bfloat16)) + Bias[expert_id] + output = torch.matmul(token_embedding.to(torch.bfloat16), B.T.to(torch.bfloat16)) + Bias[expert_id] output = output.to(torch.__getattribute__(dtypeC)) # Apply the topk weight @@ -391,14 +387,12 @@ def ref_moe(A, qB, Scale, Bias, topk_weights, sorted_token_ids, expert_ids, bloc def get_data(m, n, k, qk, scale_size, topk, E, block_M): - A = torch.empty(m, k, dtype=torch.bfloat16, device='cuda').uniform_(-1, 1) - qB = torch.randint( - 0, 256, (E, n, qk), dtype=torch.uint8, - device='cuda') # Quantized weight tensor for E experts. - Scale = torch.randint(0, 8, (E, n, k // scale_size), dtype=torch.uint8, device='cuda') - Bias = torch.empty(E, n, dtype=torch.bfloat16, device='cuda').uniform_(-1, 1) - - weights = torch.empty(m, E, dtype=torch.bfloat16, device='cuda').uniform_(-1, 1) + A = torch.empty(m, k, dtype=torch.bfloat16, device="cuda").uniform_(-1, 1) + qB = torch.randint(0, 256, (E, n, qk), dtype=torch.uint8, device="cuda") # Quantized weight tensor for E experts. + Scale = torch.randint(0, 8, (E, n, k // scale_size), dtype=torch.uint8, device="cuda") + Bias = torch.empty(E, n, dtype=torch.bfloat16, device="cuda").uniform_(-1, 1) + + weights = torch.empty(m, E, dtype=torch.bfloat16, device="cuda").uniform_(-1, 1) # topk_weights: Router weights for the top-k experts for each token. # Shape: (m, topk) # tokens_experts: A flattened tensor of expert assignments for each token. @@ -420,10 +414,7 @@ def get_data(m, n, k, qk, scale_size, topk, E, block_M): pad_len = ((cnt + block_M - 1) // block_M) * block_M - cnt if pad_len > 0: # -1 for padding (`M` instead in vLLM moe_align_block_size()) - group_token_ids = torch.cat([ - group_token_ids, - torch.full((pad_len,), -1, dtype=group_token_ids.dtype, device='cuda') - ]) + group_token_ids = torch.cat([group_token_ids, torch.full((pad_len,), -1, dtype=group_token_ids.dtype, device="cuda")]) padded_token_ids.append(group_token_ids) expert_ids.extend([eid] * ((cnt + block_M - 1) // block_M)) start = end @@ -431,21 +422,13 @@ def get_data(m, n, k, qk, scale_size, topk, E, block_M): # sorted_token_ids: The final flattened and padded tensor of token indices. sorted_token_ids = torch.cat(padded_token_ids, dim=0).to(torch.int32) # (padding_M,) # expert_ids: The final tensor of expert IDs corresponding to `sorted_token_ids`. 
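
For reference, the grouping built in get_data below is the same block alignment that vLLM's moe_align_block_size performs (padding with -1 here, as noted in the comments). A minimal standalone sketch of that layout, not part of this patch; the helper name align_tokens_to_blocks and its arguments are illustrative:

import torch

def align_tokens_to_blocks(tokens_experts: torch.Tensor, E: int, block_M: int):
    # tokens_experts: flat (m * topk,) int tensor, one expert id per routed token slot.
    # Returns sorted_token_ids (each expert's group padded to a multiple of block_M
    # with -1) and expert_ids (one expert id per block_M-sized block).
    order = torch.argsort(tokens_experts, stable=True)
    counts = torch.bincount(tokens_experts, minlength=E)
    padded_ids, block_expert_ids = [], []
    start = 0
    for eid in range(E):
        cnt = int(counts[eid])
        group = order[start:start + cnt]
        pad_len = (cnt + block_M - 1) // block_M * block_M - cnt
        if pad_len > 0:
            group = torch.cat([group, group.new_full((pad_len,), -1)])
        padded_ids.append(group)
        block_expert_ids.extend([eid] * ((cnt + block_M - 1) // block_M))
        start += cnt
    sorted_token_ids = torch.cat(padded_ids).to(torch.int32)
    expert_ids = torch.tensor(block_expert_ids, dtype=torch.int32)
    return sorted_token_ids, expert_ids

padding_M is then simply sorted_token_ids.shape[0], matching the value returned by get_data.
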
- expert_ids = torch.tensor(expert_ids, dtype=torch.int32, device='cuda') # (padding_M,) + expert_ids = torch.tensor(expert_ids, dtype=torch.int32, device="cuda") # (padding_M,) padding_M = sorted_token_ids.shape[0] # padding_M: token number after padding return A, qB, Scale, Bias, topk_weights, sorted_token_ids, expert_ids, padding_M -def main(m=256, - n=256, - k=256, - scale_size=32, - topk=4, - E=32, - fast_dequant=True, - with_bias=False, - tune=False): +def main(m=256, n=256, k=256, scale_size=32, topk=4, E=32, fast_dequant=True, with_bias=False, tune=False): # Tunable parameters block_M, block_N, block_K = 128, 256, 128 # noqa: F841 num_stages = 1 # noqa: F841 @@ -456,8 +439,7 @@ def main(m=256, num_bits = 4 num_elems_per_byte = 8 // num_bits qk = k // num_elems_per_byte - A, qB, Scale, Bias, topk_weights, sorted_token_ids, expert_ids, padding_M = get_data( - m, n, k, qk, scale_size, topk, E, block_M) + A, qB, Scale, Bias, topk_weights, sorted_token_ids, expert_ids, padding_M = get_data(m, n, k, qk, scale_size, topk, E, block_M) if tune: with set_autotune_inputs([A, qB, Scale, Bias, topk_weights, sorted_token_ids, expert_ids]): @@ -469,9 +451,9 @@ def main(m=256, topk, E, padding_M, - "bfloat16", - "bfloat16", - "float32", + T.bfloat16, + T.bfloat16, + T.float32, num_bits=num_bits, scale_size=scale_size, fast_dequant=fast_dequant, @@ -485,9 +467,9 @@ def main(m=256, topk, E, padding_M, - "bfloat16", - "bfloat16", - "float32", + T.bfloat16, + T.bfloat16, + T.float32, num_bits=num_bits, scale_size=scale_size, fast_dequant=fast_dequant, @@ -510,14 +492,11 @@ def main(m=256, expert_ids, ) - print('Tilelang kernel run finished.') + print("Tilelang kernel run finished.") - ref_output = ref_moe( - A, qB, Scale, Bias, topk_weights, sorted_token_ids, expert_ids, - block_M=block_M) # Maybe a little bit slow... + ref_output = ref_moe(A, qB, Scale, Bias, topk_weights, sorted_token_ids, expert_ids, block_M=block_M) # Maybe a little bit slow... - latency = tilelang.profiler.do_bench( - lambda: kernel(A, qB, Scale, Bias, topk_weights, sorted_token_ids, expert_ids), warmup=100) + latency = tilelang.profiler.do_bench(lambda: kernel(A, qB, Scale, Bias, topk_weights, sorted_token_ids, expert_ids), warmup=100) print("Tilelang: {:.2f} ms".format(latency)) print("Tilelang: {:.2f} TFlops".format(total_flops / latency * 1e-9)) @@ -525,32 +504,19 @@ def main(m=256, max_val = diff.max() max_idx = diff.argmax() print(f"max abs diff: {max_val} at index: {max_idx}") - assert_similar( - output, ref_output, name="output", - eps=2e-5) # We care about the similarity rather than abs. difference + assert_similar(output, ref_output, name="output", eps=2e-5) # We care about the similarity rather than abs. difference print("All checks pass. 
✅") if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument( - "--M", type=int, default=16384, help="M") # From gpt-oss-20b MoE's first gemm + parser.add_argument("--M", type=int, default=16384, help="M") # From gpt-oss-20b MoE's first gemm parser.add_argument("--N", type=int, default=5760, help="N") parser.add_argument("--K", type=int, default=2944, help="K") parser.add_argument("--scale_size", type=int, default=32, help="scale size") - parser.add_argument( - "--topk", type=int, default=4, help="topk") # experts activated for each token + parser.add_argument("--topk", type=int, default=4, help="topk") # experts activated for each token parser.add_argument("--E", type=int, default=32, help="E") # number of experts parser.add_argument("--tune", action="store_true", help="tune configs") args = parser.parse_args() - main( - args.M, - args.N, - args.K, - args.scale_size, - topk=args.topk, - E=args.E, - fast_dequant=True, - with_bias=True, - tune=args.tune) + main(args.M, args.N, args.K, args.scale_size, topk=args.topk, E=args.E, fast_dequant=True, with_bias=True, tune=args.tune) diff --git a/examples/dsa_sparse_finetune/dsa.py b/examples/dsa_sparse_finetune/dsa.py new file mode 100644 index 0000000000000000000000000000000000000000..9fae8e5e3d698c9d7763b707fa2b2fd7506257c2 --- /dev/null +++ b/examples/dsa_sparse_finetune/dsa.py @@ -0,0 +1,223 @@ +from typing import Optional +import torch +import torch.nn.functional as F +from indexer_topk_reducesum import indexer_topk_reducesum_interface +from indexer_bwd import indexer_bwd_interface +from sparse_mla_fwd import sparse_mla_fwd_interface +from sparse_mla_bwd import sparse_mla_bwd +from sparse_mla_topk_reducesum import sparse_mla_topk_reducesum_interface +from einops import einsum, repeat +from utils import get_abs_err, get_err_ratio + + +class RegsiterLossFunction(torch.autograd.Function): + @staticmethod + def forward(ctx, x, loss): + ctx.save_for_backward(loss) + return x + + @staticmethod + def backward(ctx, grad): + loss = ctx.saved_tensors + return grad, torch.ones(1, dtype=loss[0].dtype, device=loss[0].device) + + +register_loss = RegsiterLossFunction.apply + + +def ref_deepseek_sparse_attention_innner( + q: torch.Tensor, + kv: torch.Tensor, + index_q: torch.Tensor, + index_k: torch.Tensor, + weights: torch.Tensor, + topk: int, + dim_v: int, + sm_scale: Optional[float] = None, + index_sm_scale: Optional[float] = None, +): + dtype = q.dtype + q, kv, index_q, index_k, weights = map(lambda x: x.to(torch.float32), (q, kv, index_q, index_k, weights)) + + index_sm_scale = index_q.shape[-1] ** -0.5 + b, s = index_q.shape[:2] + + # tl_topk_indices = tl_topk_indices.to(torch.int64) + # tl_topk_indices[tl_topk_indices == -1] = s + + casual_mask = (torch.arange(s)[:, None] >= torch.arange(s)[None, :]).to(q.device) + index_logits = einsum(index_q, index_k, "b s1 h k, b s2 k -> b s1 h s2") + index_logits = F.relu(index_logits) + index_logits = (index_logits * weights.unsqueeze(-1)).sum(dim=-2, dtype=torch.float32) * index_sm_scale + index_logits = torch.where(casual_mask, index_logits, float("-inf")) + topk_indices = torch.topk(index_logits, k=topk, dim=-1).indices + topk_logits = torch.gather(F.pad(index_logits, (0, 1), value=float("-inf")), dim=-1, index=topk_indices) + topk_score = F.log_softmax(topk_logits, dim=-1, dtype=torch.float32) + index_topk_score = topk_score + + if sm_scale is None: + sm_scale = kv.shape[-1] ** -0.5 + + h = q.shape[-2] + index_mask = torch.zeros((b, s, s + 1), dtype=torch.bool, 
device="cuda").scatter_( + dim=-1, index=topk_indices, src=torch.ones_like(topk_indices, dtype=torch.bool) + )[:, :, :-1] + mask = repeat(casual_mask & index_mask, "b s1 s2 -> b s1 h s2", h=h) + k, v = kv, kv[..., :dim_v] + logits = einsum(q, k, "b s1 h d, b s2 d -> b s1 h s2") * sm_scale + logits = torch.where(mask, logits, float("-inf")) + attn_score = F.softmax(logits, dim=-1, dtype=torch.float32) + o = einsum(attn_score, v, "b s1 h s2, b s2 d -> b s1 h d") + + attn_score = attn_score.sum(dim=-2) # [b, s1, s2] + attn_topk_score = torch.gather(F.pad(attn_score, (0, 1)), dim=-1, index=topk_indices) + attn_topk_score = attn_topk_score / attn_topk_score.sum(dim=-1, keepdim=True) + + loss = F.kl_div(index_topk_score.clip(-100, 0), attn_topk_score.detach().log().clip(-100, 0), log_target=True, reduction="sum") + o = register_loss(o, loss) + + return o.to(dtype), topk_indices + + +def ref_deepseek_sparse_attention( + q: torch.Tensor, + kv: torch.Tensor, + index_q: torch.Tensor, + index_k: torch.Tensor, + weights: torch.Tensor, + offsets: torch.Tensor, + topk: int, + dim_v: int, + sm_scale: Optional[float] = None, + index_sm_scale: Optional[float] = None, +): + all_o, all_topk_indices = [], [] + for i in range(offsets.shape[0] - 1): + o, topk_indices = ref_deepseek_sparse_attention_innner( + q[None, offsets[i] : offsets[i + 1]], + kv[None, offsets[i] : offsets[i + 1]], + index_q[None, offsets[i] : offsets[i + 1]], + index_k[None, offsets[i] : offsets[i + 1]], + weights[None, offsets[i] : offsets[i + 1]], + topk, + dim_v, + sm_scale, + index_sm_scale, + ) + all_o.append(o.squeeze(0)) + all_topk_indices.append(topk_indices.squeeze(0)) + o = torch.cat(all_o, dim=0) + topk_indices = torch.cat(all_topk_indices, dim=0) + return o, topk_indices + + +class DSAFunction(torch.autograd.Function): + @staticmethod + def forward( + ctx, + q: torch.Tensor, + kv: torch.Tensor, + index_q: torch.Tensor, + index_k: torch.Tensor, + weights: torch.Tensor, + offsets: torch.Tensor, + topk: int, + dim_v: int, + sm_scale: Optional[float] = None, + ): + # topk_indices, index_score = ref_index_score(index_q, weights, index_k, topk) + topk_indices, index_score = indexer_topk_reducesum_interface(index_q, weights, index_k, topk, offsets) + o, lse = sparse_mla_fwd_interface(q, kv.unsqueeze(-2), topk_indices.unsqueeze(-2), offsets, sm_scale=sm_scale, d_v=dim_v) + ctx.save_for_backward(q, kv, index_q, index_k, weights, topk_indices, index_score, o, lse, offsets) + ctx.topk = topk + ctx.dim_v = dim_v + ctx.sm_scale = sm_scale + return o, topk_indices + + @staticmethod + def backward( + ctx, + do: torch.Tensor, + _1: torch.Tensor, + ): + q, kv, index_q, index_k, weights, topk_indices, index_score, o, lse, offsets = ctx.saved_tensors + attn_score = sparse_mla_topk_reducesum_interface( + q, kv.unsqueeze(-2), topk_indices.unsqueeze(-2), lse, offsets, dim_v=ctx.dim_v + ).squeeze(-2) + dq, dkv = sparse_mla_bwd(q, kv.unsqueeze(-2), o, do, topk_indices.unsqueeze(-2), lse, offsets, sm_scale=ctx.sm_scale) + dindex_q, dweights, dindex_k = indexer_bwd_interface(index_q, weights, index_k, attn_score, index_score, topk_indices, offsets) + return dq, dkv.squeeze(-2), dindex_q, dindex_k, dweights, None, None, None, None + + +def deepseek_sparse_attention( + q: torch.Tensor, + kv: torch.Tensor, + index_q: torch.Tensor, + index_k: torch.Tensor, + weights: torch.Tensor, + offsets: torch.Tensor, + topk: int, + dim_v: int, + sm_scale: Optional[float] = None, +): + return DSAFunction.apply(q, kv, index_q, index_k, weights, offsets, topk, dim_v, 
sm_scale) + + +def test_kernel( + B=1, + S=2048, + H=16, + D=512, + tail_D=64, + index_D=128, + topk=64, +): + torch.manual_seed(42) + q = torch.randn((S, H, D + tail_D)).cuda().bfloat16().requires_grad_() + kv = torch.randn((S, D + tail_D)).cuda().bfloat16().requires_grad_() + index_q = torch.randn((S, H, index_D)).cuda().bfloat16().requires_grad_() + weights = torch.randn((S, H)).cuda().bfloat16().requires_grad_() + index_k = torch.randn((S, index_D)).cuda().bfloat16().requires_grad_() + do = torch.randn((S, H, D)).cuda().bfloat16().requires_grad_() + offsets = torch.tensor([0, S // 2, S], dtype=torch.int32).cuda() + + o, topk_indices = deepseek_sparse_attention(q, kv, index_q, index_k, weights, offsets, topk, D) + o.backward(do) + q_grad, q.grad = q.grad, None + kv_grad, kv.grad = kv.grad, None + index_q_grad, index_q.grad = index_q.grad, None + index_k_grad, index_k.grad = index_k.grad, None + weights_grad, weights.grad = weights.grad, None + + ref_o, ref_topk_indices = ref_deepseek_sparse_attention(q, kv, index_q, index_k, weights, offsets, topk, D) + ref_o.backward(do) + ref_q_grad, q.grad = q.grad, None + ref_kv_grad, kv.grad = kv.grad, None + ref_index_q_grad, index_q.grad = index_q.grad, None + ref_index_k_grad, index_k.grad = index_k.grad, None + ref_weights_grad, weights.grad = weights.grad, None + + print(f"o err: {get_abs_err(o, ref_o):.6f} ratio: {get_err_ratio(o, ref_o):.6f}") + print(f"q.grad err: {get_abs_err(q_grad, ref_q_grad):.6f} ratio: {get_err_ratio(q_grad, ref_q_grad):.6f}") + print(f"kv.grad err: {get_abs_err(kv_grad, ref_kv_grad):.6f} ratio: {get_err_ratio(kv_grad, ref_kv_grad):.6f}") + print( + f"index_q.grad err: {get_abs_err(index_q_grad[:, :64, :], ref_index_q_grad[:, :64, :]):.6f} ratio: {get_err_ratio(index_q_grad[:, :64, :], ref_index_q_grad[:, :64, :]):.6f}" + ) + print(f"index_k.grad err: {get_abs_err(index_k_grad, ref_index_k_grad):.6f} ratio: {get_err_ratio(index_k_grad, ref_index_k_grad):.6f}") + print(f"weights.grad err: {get_abs_err(weights_grad, ref_weights_grad):.6f} ratio: {get_err_ratio(weights_grad, ref_weights_grad):.6f}") + + intersections = [] + for j in range(S): + ref_np = ref_topk_indices[j].cpu().to(torch.int32).numpy() + trt_np = topk_indices[j].cpu().to(torch.int32).numpy() + + mask = trt_np != -1 + + set_ref = set(ref_np[mask]) + set_trt = set(trt_np[mask]) + intersection = set_ref & set_trt + intersections.append(len(intersection) / len(set_ref)) + print("average intersections: {:.4f}".format(sum(intersections) / len(intersections))) + + +test_kernel() diff --git a/examples/dsa_sparse_finetune/index.py b/examples/dsa_sparse_finetune/index.py new file mode 100644 index 0000000000000000000000000000000000000000..5e4800411004e5890faba0578cf83f09e27f2dc9 --- /dev/null +++ b/examples/dsa_sparse_finetune/index.py @@ -0,0 +1,82 @@ +# Modified from: https://github.com/fla-org/flash-linear-attention/blob/main/fla/ops/utils/index.py +import torch +import torch.nn.functional as F +import functools +from typing import Callable, Any + + +def tensor_cache( + fn: Callable[..., torch.Tensor], +) -> Callable[..., torch.Tensor]: + """ + A decorator that caches the most recent result of a function with tensor inputs. + + This decorator will store the output of the decorated function for the most recent set of input tensors. + If the function is called again with the same input tensors, it will return the cached result. + + + Args: + fn (Callable[..., torch.Tensor]): + The function to be decorated. It should take tensor inputs and return tensor outputs. 
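+
+    Example (illustrative sketch only; square is a hypothetical function, not part of this module):
+
+        @tensor_cache
+        def square(x: torch.Tensor) -> torch.Tensor:
+            return x * x
+
+        a = torch.arange(4)
+        y1 = square(a)          # computed, result cached for this exact argument object
+        y2 = square(a)          # same tensor object passed again -> cached result returned
+        y3 = square(a.clone())  # different object, even with equal values -> recomputed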
+ + Returns: + Callable[..., torch.Tensor]: + A wrapped version of the input function with single-entry caching. + """ + last_args: tuple | None = None + last_kwargs: dict | None = None + last_result: Any = None + + @functools.wraps(fn) + def wrapper(*args: Any, **kwargs: Any) -> Any: + nonlocal last_args, last_kwargs, last_result + + if ( + (last_args is not None and last_kwargs is not None) + and (len(args) == len(last_args) and len(kwargs) == len(last_kwargs)) + and all(a is b for a, b in zip(args, last_args, strict=False)) + and all(k in last_kwargs and v is last_kwargs[k] for k, v in kwargs.items()) + ): + return last_result + + result = fn(*args, **kwargs) + last_args, last_kwargs, last_result = args, kwargs, result + return result + + return wrapper + + +@tensor_cache +def prepare_lens(cu_seqlens: torch.LongTensor) -> torch.LongTensor: + return torch.diff(cu_seqlens) + + +@tensor_cache +def prepare_cu_seqlens_from_lens( + lens: torch.LongTensor, + dtype: torch.dtype | None = torch.int32, +) -> torch.LongTensor: + return F.pad(lens.cumsum(dim=0, dtype=dtype), (1, 0)) + + +@tensor_cache +def prepare_lens_from_cu_seqlens( + cu_seqlens: torch.LongTensor, +) -> torch.LongTensor: + return torch.diff(cu_seqlens) + + +@tensor_cache +def prepare_position_ids(cu_seqlens: torch.LongTensor) -> torch.LongTensor: + return torch.cat([torch.arange(n, dtype=cu_seqlens.dtype, device=cu_seqlens.device) for n in prepare_lens(cu_seqlens).unbind()]) + + +@tensor_cache +def prepare_sequence_ids(cu_seqlens: torch.LongTensor) -> torch.LongTensor: + return prepare_position_ids(cu_seqlens).eq(0).cumsum(0) - 1 + + +@tensor_cache +def prepare_token_indices(cu_seqlens: torch.LongTensor) -> torch.LongTensor: + position_ids = prepare_position_ids(cu_seqlens) + return torch.stack([prepare_sequence_ids(cu_seqlens), position_ids], 1).to(cu_seqlens) diff --git a/examples/dsa_sparse_finetune/indexer_bwd.py b/examples/dsa_sparse_finetune/indexer_bwd.py new file mode 100644 index 0000000000000000000000000000000000000000..68508ad4e45104b3b5717c95ef30ebfe1caaccd4 --- /dev/null +++ b/examples/dsa_sparse_finetune/indexer_bwd.py @@ -0,0 +1,254 @@ +import torch +import torch.nn.functional as F +from einops import einsum, repeat + +import tilelang as tl +import tilelang.language as T +from typing import Optional +from index import prepare_token_indices + +from utils import get_abs_err, get_err_ratio + +BF16 = T.bfloat16 +FP32 = T.float32 +INT32 = T.int32 + +pass_configs = { + tl.PassConfigKey.TL_DISABLE_TMA_LOWER: True, + tl.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, +} + + +@tl.jit(pass_configs=pass_configs) +def tl_indexer_bwd_impl( + heads: int, + dim: int, + topk: int, + sm_scale: Optional[float] = None, + block_I: int = 32, + num_stages: int = 0, + num_threads: int = 128, +): + assert num_stages == 0 + assert topk == tl.math.next_power_of_2(topk) + assert topk % block_I == 0 + assert heads <= 64 and heads % 8 == 0 + batch_plus_one = T.symbolic("batch_plus_one") + seq_len = T.symbolic("seq_len") + dtype: str = BF16 + accum_dtype: str = FP32 + index_q_shape = [seq_len, heads, dim] + weights_shape = [seq_len, heads] + index_k_shape = [seq_len, dim] + shape_p = [seq_len, topk] + topk_indices_shape = [seq_len, topk] + offsets_shape = [batch_plus_one] + token_indices_shape = [seq_len, 2] + if sm_scale is None: + sm_scale = dim**-0.5 + + @T.prim_func + def tl_indexer_bwd_kernel( + IndexQ: T.Tensor(index_q_shape, dtype), + Weights: T.Tensor(weights_shape, dtype), + IndexK: T.Tensor(index_k_shape, dtype), + dIndexQ: 
T.Tensor(index_q_shape, dtype), + dWeights: T.Tensor(weights_shape, dtype), + dIndexK: T.Tensor(index_k_shape, dtype), + AttnScore: T.Tensor(shape_p, FP32), + IndexScore: T.Tensor(shape_p, FP32), + TopkIndices: T.Tensor(topk_indices_shape, INT32), + Offsets: T.Tensor(offsets_shape, INT32), + TokenIndices: T.Tensor(token_indices_shape, INT32), + ): + with T.Kernel(seq_len, threads=num_threads) as (bx): + i_b, i_t = TokenIndices[bx, 0], TokenIndices[bx, 1] + bos = Offsets[i_b] + num_blocks = T.ceildiv(topk, block_I) + + index_q_shared = T.alloc_shared([heads, dim], dtype=dtype) + weights_shared = T.alloc_shared([heads], dtype=dtype) + + d_index_q_frag = T.alloc_fragment([heads, dim], dtype=accum_dtype) + d_weights_frag = T.alloc_fragment([heads], dtype=accum_dtype) + + T.copy(IndexQ[bos + i_t, :, :], index_q_shared) + T.copy(Weights[bos + i_t, :], weights_shared) + T.fill(d_index_q_frag, 0) + T.fill(d_weights_frag, 0) + + for i, j in T.Parallel(heads, dim): + index_q_shared[i, j] = index_q_shared[i, j] * sm_scale + + for bi_i in T.Pipelined(num_blocks, num_stages=num_stages): + i_st = bi_i * block_I + i_ed = (bi_i + 1) * block_I + + indices_shared = T.alloc_shared([block_I], dtype=INT32) + T.copy(TopkIndices[bos + i_t, i_st:i_ed], indices_shared) + + index_k_shared = T.alloc_shared([block_I, dim], dtype=dtype) + for i, j in T.Parallel(block_I, dim): + pos = indices_shared[i] + index_k_shared[i, j] = T.if_then_else((pos > -1) & (pos <= i_t), IndexK[bos + pos, j], 0) + + attn_score_shared = T.alloc_shared([block_I], dtype=accum_dtype) + index_score_shared = T.alloc_shared([block_I], dtype=accum_dtype) + for i in T.Parallel(block_I): + attn_score_shared[i] = AttnScore[bos + i_t, i_st + i] + index_score_shared[i] = IndexScore[bos + i_t, i_st + i] + + logits = T.alloc_fragment((block_I, heads), accum_dtype) + T.gemm( + index_k_shared, + index_q_shared, + logits, + transpose_A=False, + transpose_B=True, + clear_accum=True, + ) + for i, j in T.Parallel(block_I, heads): + logits[i, j] = T.max(logits[i, j], 0) + + # dw + d_weights_i = T.alloc_fragment((block_I, heads), accum_dtype) + for i, j in T.Parallel(block_I, heads): + d_weights_i[i, j] = (index_score_shared[i] - attn_score_shared[i]) * logits[i, j] + T.reduce_sum(d_weights_i, d_weights_frag, dim=0, clear=False) + + d_logits_qk = T.alloc_shared((block_I, heads), accum_dtype) + d_logits_qk_cast1 = T.alloc_fragment((block_I, heads), dtype) + d_logits_qk_cast2 = T.alloc_fragment((block_I, heads), dtype) + + for i, j in T.Parallel(block_I, heads): + d_relu = T.alloc_var(accum_dtype) + if logits[i, j] > 0: + d_relu = 1.0 + else: + d_relu = 0.0 + d_logits_qk[i, j] = (index_score_shared[i] - attn_score_shared[i]) * d_relu * weights_shared[j] + + # dq + T.copy(d_logits_qk, d_logits_qk_cast1) + T.gemm( + d_logits_qk_cast1, # [BS, HQ] + index_k_shared, # [BS, K] + d_index_q_frag, # [HQ, K] + transpose_A=True, + transpose_B=False, + clear_accum=False, + ) + + # dk + T.copy(d_logits_qk, d_logits_qk_cast2) + d_index_k_frag = T.alloc_fragment([block_I, dim], dtype=accum_dtype) + T.gemm( + d_logits_qk_cast2, # [BS, HQ] + index_q_shared, # [HQ, K] + d_index_k_frag, # [BS, K] + transpose_A=False, + transpose_B=False, + clear_accum=True, + ) + + for i, j in T.Parallel(block_I, dim): + pos = indices_shared[i] + if (pos > -1) & (pos <= i_t): + T.atomic_add(dIndexK[bos + pos, j], d_index_k_frag[i, j]) + + for i, j in T.Parallel(heads, dim): + d_index_q_frag[i, j] = d_index_q_frag[i, j] * sm_scale + + T.copy(d_index_q_frag, dIndexQ[bos + i_t, :, :]) + 
T.copy(d_weights_frag, dWeights[bos + i_t, :]) + + return tl_indexer_bwd_kernel + + +def indexer_bwd_interface( + q: torch.Tensor, + weights: torch.Tensor, + k: torch.Tensor, + attn_score: torch.Tensor, + index_score: torch.Tensor, + topk_indices: torch.Tensor, + offsets: torch.Tensor, +): + _, heads, dim, topk = *q.shape, topk_indices.shape[-1] + token_indices = prepare_token_indices(offsets) + dq = torch.zeros_like(q) + dweights = torch.zeros_like(weights) + dk = torch.zeros_like(k) + kernel = tl_indexer_bwd_impl(heads, dim, topk) + kernel(q, weights, k, dq, dweights, dk, attn_score, index_score, topk_indices, offsets, token_indices) + return dq, dweights, dk + + +def ref_indexer_bwd( + Q: torch.Tensor, Weights: torch.Tensor, K: torch.Tensor, TopkIndices: torch.Tensor, AttnScore: torch.Tensor, offsets: torch.Tensor +) -> torch.Tensor: + Q.requires_grad_(True) + Weights.requires_grad_(True) + K.requires_grad_(True) + softmax_scale = Q.shape[-1] ** -0.5 + all_loss = [] + all_log_topk_prob = [] + for i in range(offsets.shape[0] - 1): + assert (offsets[i + 1] - offsets[i]).item() >= TopkIndices.shape[-1] + q = Q[offsets[i] : offsets[i + 1]] + weights = Weights[offsets[i] : offsets[i + 1]] + k = K[offsets[i] : offsets[i + 1]] + topk_indices = TopkIndices[offsets[i] : offsets[i + 1]] + attn_score = AttnScore[offsets[i] : offsets[i + 1]] + s = q.shape[0] + mask = (torch.arange(s)[:, None] >= torch.arange(s)[None, :]).to(q.device) + logits = einsum(q, k, "s1 h k, s2 k -> s1 h s2") * softmax_scale + logits = F.relu(logits) + score = (logits * weights.unsqueeze(-1)).sum(dim=-2, dtype=torch.float32) + score = torch.where(mask, score, float("-inf")) + topk_value = torch.gather(score, dim=-1, index=topk_indices.to(torch.int64)) + log_topk_prob = F.log_softmax(topk_value, dim=-1, dtype=torch.float32) + loss = F.kl_div(log_topk_prob.clip(-100, 0), attn_score.log().clip(-100, 0), log_target=True, reduction="sum") + all_loss.append(loss) + all_log_topk_prob.append(log_topk_prob) + loss = torch.stack(all_loss).sum() + loss.backward() + log_topk_prob = torch.cat(all_log_topk_prob, dim=0) + return log_topk_prob.exp(), Q.grad, Weights.grad, K.grad + + +def test_kernel( + B=1, + S=2048, + H=16, + D=128, + topk=64, +): + torch.manual_seed(42) + q = torch.randn((S, H, D)).cuda().bfloat16() + w = torch.randn((S, H)).cuda().bfloat16() + k = torch.randn((S, D)).cuda().bfloat16() + offsets = torch.tensor([0, 1023, S], dtype=torch.int32).cuda() + + all_attn_score = [] + for i in range(offsets.shape[0] - 1): + seq_len = (offsets[i + 1] - offsets[i]).item() + mask = (torch.arange(seq_len)[:, None] >= torch.arange(topk)[None, :]).to(q.device) + logits = torch.ones(seq_len, topk).cuda() + logits = torch.where(mask, logits, float("-inf")) + attn_score = F.softmax(logits, dim=-1, dtype=torch.float32) + all_attn_score.append(attn_score) + attn_score = torch.cat(all_attn_score, dim=0) + + topk_indices = repeat(torch.arange(topk, dtype=torch.int32).cuda(), "k -> s k", s=S).contiguous() + index_score, ref_dq, ref_dw, ref_dk = ref_indexer_bwd(q, w, k, topk_indices, attn_score, offsets) + + dq, dw, dk = indexer_bwd_interface(q, w, k, attn_score, index_score, topk_indices, offsets) + + print(f"dq err: {get_abs_err(dq, ref_dq):.6f} ratio: {get_err_ratio(dq, ref_dq):.6f}") + print(f"dq err: {get_abs_err(dw, ref_dw):.6f} ratio: {get_err_ratio(dw, ref_dw):.6f}") + print(f"dq err: {get_abs_err(dk, ref_dk):.6f} ratio: {get_err_ratio(dk, ref_dk):.6f}") + + +if __name__ == "__main__": + test_kernel() diff --git 
a/examples/dsa_sparse_finetune/indexer_topk_reducesum.py b/examples/dsa_sparse_finetune/indexer_topk_reducesum.py new file mode 100644 index 0000000000000000000000000000000000000000..d76eb027247b9ce8fdf4cd20f422d7a79304eb3b --- /dev/null +++ b/examples/dsa_sparse_finetune/indexer_topk_reducesum.py @@ -0,0 +1,273 @@ +import math +import torch +import torch.nn.functional as F +from einops import einsum + +import tilelang as tl +import tilelang.language as T +from typing import Optional +from index import prepare_token_indices + +from utils import get_abs_err, get_err_ratio + +BF16 = T.bfloat16 +FP32 = T.float32 +INT32 = T.int32 + +pass_configs = { + tl.PassConfigKey.TL_DISABLE_THREAD_STORAGE_SYNC: True, + tl.PassConfigKey.TL_DISABLE_TMA_LOWER: True, + tl.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, +} + + +@tl.jit(pass_configs=pass_configs) +def tl_indexer_topk_reducesum_impl( + heads: int, + dim: int, + topk: int, + sm_scale: Optional[float] = None, + block_K: int = 32, + dtype: str = FP32, + num_stages: int = 0, + num_threads: int = 128, +): + assert topk == tl.math.next_power_of_2(topk) + assert topk % block_K == 0 + assert heads <= 64 and heads % 8 == 0 + assert num_stages == 0 + batch_plus_one = T.symbolic("batch_plus_one") + seq_len = T.symbolic("seq_len") + + index_q_shape = [seq_len, heads, dim] + weights_shape = [seq_len, heads] + index_k_shape = [seq_len, dim] + topk_indices_shape = [seq_len, topk] + offsets_shape = [batch_plus_one] + token_indices_shape = [seq_len, 2] + + N = 2 * topk + num_iters = int(round(math.log2(N))) + if sm_scale is None: + sm_scale = dim**-0.5 + + @T.macro + def bitonic_sort( + topk_index_shared: T.SharedBuffer([N], dtype=INT32), + topk_value_shared: T.SharedBuffer([N], dtype=FP32), + ): + T.sync_threads() + for i1 in T.serial(num_iters): + for i2 in T.serial(i1 + 1): + for i in T.Parallel(N): + ascending = (i & (1 << (i1 + 1))) != 0 + j = i ^ (1 << (i1 - i2)) + if i < j and ( + (ascending and topk_value_shared[i] > topk_value_shared[j]) + or (not ascending and topk_value_shared[i] < topk_value_shared[j]) + ): + val = topk_value_shared[i] + topk_value_shared[i] = topk_value_shared[j] + topk_value_shared[j] = val + idx = topk_index_shared[i] + topk_index_shared[i] = topk_index_shared[j] + topk_index_shared[j] = idx + T.sync_threads() + + @T.prim_func + def tl_indexer_topk_reducesum_kernel( + IndexQ: T.Tensor(index_q_shape, dtype), + Weights: T.Tensor(weights_shape, dtype), + IndexK: T.Tensor(index_k_shape, dtype), + TopkIndices: T.Tensor(topk_indices_shape, INT32), + ReduceSum: T.Tensor(topk_indices_shape, FP32), + Offsets: T.Tensor(offsets_shape, INT32), + TokenIndices: T.Tensor(token_indices_shape, INT32), + ): + with T.Kernel(seq_len, threads=num_threads) as (bx): + i_b, i_t = TokenIndices[bx, 0], TokenIndices[bx, 1] + bos, eos = Offsets[i_b], Offsets[i_b + 1] + num_blocks = T.ceildiv(i_t + 1, block_K) + + topk_index_shared = T.alloc_shared([N], dtype=INT32) + topk_value_shared = T.alloc_shared([N], dtype=FP32) + + T.fill(topk_index_shared, -1) + T.fill(topk_value_shared, float("-inf")) + T.sync_threads() + + index_q_shared = T.alloc_shared([heads, dim], dtype=dtype) + T.copy(IndexQ[bos + i_t, :, :], index_q_shared) + T.sync_threads() + + weights_frag = T.alloc_shared([heads], dtype=dtype) + T.copy(Weights[bos + i_t, :], weights_frag) + T.sync_threads() + + for i, j in T.Parallel(heads, dim): + index_q_shared[i, j] = index_q_shared[i, j] * sm_scale + T.sync_threads() + + for bk_i in T.Pipelined(num_blocks, num_stages=num_stages): + k_st = bk_i * 
block_K + k_ed = T.min((bk_i + 1) * block_K, eos - bos) + + index_k_shared = T.alloc_shared([block_K, dim], dtype=dtype) + for i, j in T.Parallel(block_K, dim): + index_k_shared[i, j] = T.if_then_else(k_st + i < k_ed, IndexK[bos + k_st + i, j], 0) + T.sync_threads() + + logits = T.alloc_fragment((block_K, heads), FP32) + T.gemm( + index_k_shared, + index_q_shared, + logits, + transpose_A=False, + transpose_B=True, + clear_accum=True, + ) + T.sync_threads() + + for i, j in T.Parallel(block_K, heads): + logits[i, j] = T.max(logits[i, j], 0) * weights_frag[j] + T.sync_threads() + + logits_sum = T.alloc_fragment(block_K, FP32) + T.reduce_sum(logits, logits_sum, dim=1) + T.sync_threads() + + offset = T.alloc_var(INT32) + if k_st >= topk: + offset = topk + (k_st % topk) + else: + offset = k_st + T.sync_threads() + for i in T.Parallel(block_K): + if k_st + i > i_t: + logits_sum[i] = float("-inf") + j = offset + i + topk_index_shared[j] = k_st + i + topk_value_shared[j] = logits_sum[i] + T.sync_threads() + + if k_ed > topk and k_ed % topk == 0: + bitonic_sort(topk_index_shared, topk_value_shared) + + bitonic_sort(topk_index_shared, topk_value_shared) + + logits_max_frag = T.alloc_fragment([1], dtype=FP32) + logits_frag = T.alloc_fragment([topk], dtype=FP32) + reducesum_shared = T.alloc_shared([topk], dtype=FP32) + + T.copy(topk_value_shared[:topk], logits_frag) + T.sync_threads() + + T.reduce_max(logits_frag, logits_max_frag, dim=-1) + T.sync_threads() + + for i in T.Parallel(topk): + logits_frag[i] = T.exp(logits_frag[i] - logits_max_frag[0]) + T.sync_threads() + + lse_frag = T.alloc_fragment([1], dtype=FP32) + T.reduce_sum(logits_frag, lse_frag) + T.sync_threads() + + for i in T.Parallel(topk): + reducesum_shared[i] = logits_frag[i] / lse_frag[0] + T.sync_threads() + + # for i in T.Parallel(topk): + # reducesum_shared[i] = logits_frag[i] + # T.sync_threads() + + for i in T.Parallel(topk): + if topk_index_shared[i] > i_t: + topk_index_shared[i] = -1 + T.sync_threads() + + T.copy(topk_index_shared[:topk], TopkIndices[bos + i_t, :]) + T.copy(reducesum_shared[:topk], ReduceSum[bos + i_t, :]) + + return tl_indexer_topk_reducesum_kernel + + +def indexer_topk_reducesum_interface( + q: torch.Tensor, + weights: torch.Tensor, + k: torch.Tensor, + topk: int, + offsets: torch.Tensor, + dtype: str = BF16, +): + seq_len, heads, dim = q.shape + kernel = tl_indexer_topk_reducesum_impl(heads=heads, dim=dim, topk=topk, dtype=dtype) + token_indices = prepare_token_indices(offsets) + topk_indices = torch.zeros((seq_len, topk), device=q.device, dtype=torch.int32) + topk_score = torch.zeros((seq_len, topk), device=q.device, dtype=torch.float32) + kernel(q, weights, k, topk_indices, topk_score, offsets, token_indices) + return topk_indices, topk_score + + +def ref_index_score(Q: torch.Tensor, Weights: torch.Tensor, K: torch.Tensor, topk: int, offsets: torch.Tensor) -> torch.Tensor: + all_topk_indices = [] + all_topk_score = [] + for i in range(offsets.shape[0] - 1): + assert (offsets[i + 1] - offsets[i]).item() >= topk + q = Q[offsets[i] : offsets[i + 1]] + weights = Weights[offsets[i] : offsets[i + 1]] + k = K[offsets[i] : offsets[i + 1]] + softmax_scale = q.shape[-1] ** -0.5 + s = q.shape[0] + mask = (torch.arange(s)[:, None] >= torch.arange(s)[None, :]).to(q.device) + logits = einsum(q, k, "s1 h k, s2 k -> s1 h s2") + logits = F.relu(logits) + logits = (logits * weights.unsqueeze(-1)).sum(dim=-2, dtype=torch.float32) * softmax_scale + logits = torch.where(mask, logits, float("-inf")) + topk_logits, topk_indices = 
torch.topk(logits, k=topk, dim=-1) + topk_score = F.softmax(topk_logits, dim=-1, dtype=torch.float32) + all_topk_indices.append(topk_indices) + all_topk_score.append(topk_score) + topk_indices = torch.cat(all_topk_indices, dim=0) + topk_score = torch.cat(all_topk_score, dim=0) + return topk_indices, topk_score + + +def test_kernel( + B=1, + S=2048, + H=64, + D=128, + topk=64, +): + torch.manual_seed(42) + + q = torch.randn((S, H, D)).cuda().bfloat16() + weights = torch.randn((S, H)).cuda().bfloat16() + k = torch.randn((S, D)).cuda().bfloat16() + offsets = torch.tensor([0, S], dtype=torch.int32).cuda() + + ref_topk_indices, ref_topk_score = ref_index_score(q, weights, k, topk, offsets) + + topk_indices, topk_score = indexer_topk_reducesum_interface(q, weights, k, topk, offsets) + + for j in range(S): + ref_np = ref_topk_indices[j].cpu().to(torch.int32).numpy() + trt_np = topk_indices[j].cpu().to(torch.int32).numpy() + + ref_np_val = ref_topk_score[j] + trt_np_val = topk_score[j] + + mask = (ref_np_val > 0).cpu().numpy() + + set_ref = set(ref_np[mask]) + set_trt = set(trt_np[mask]) + intersection = set_ref & set_trt + + print("idx:", j, "selected/all:", len(intersection), "/", len(set_ref), "=", len(intersection) / len(set_ref)) + + print(f"err: {get_abs_err(ref_np_val, trt_np_val):.6f} ratio: {get_err_ratio(ref_np_val, trt_np_val):.6f}") + + +if __name__ == "__main__": + test_kernel() diff --git a/examples/dsa_sparse_finetune/sparse_mla_bwd.py b/examples/dsa_sparse_finetune/sparse_mla_bwd.py new file mode 100644 index 0000000000000000000000000000000000000000..8b76dbca1c5fa483f57399e701a17bff870edd80 --- /dev/null +++ b/examples/dsa_sparse_finetune/sparse_mla_bwd.py @@ -0,0 +1,354 @@ +# ruff: noqa +import tilelang +from tilelang import language as T +import torch +from index import prepare_token_indices + +from utils import assert_tensors_similar + + +@tilelang.jit(out_idx=[-1]) +def preprocess( + H, + D, + block_ND=32, + num_stages=5, + dtype=T.bfloat16, + accum_dtype=T.float32, +): + assert dtype == T.bfloat16 + assert accum_dtype == T.float32 + + S = T.symbolic("S") + + shape = [S, H, D] + + @T.prim_func + def preprocess_kernel( + O: T.Tensor(shape, dtype), + dO: T.Tensor(shape, dtype), + Delta: T.Tensor([S, H], accum_dtype), + ): + with T.Kernel(H, T.ceildiv(S, block_ND)) as (bx, by): + o = T.alloc_fragment([block_ND, block_ND], accum_dtype) + do = T.alloc_fragment([block_ND, block_ND], accum_dtype) + delta = T.alloc_fragment([block_ND], accum_dtype) + acc = T.alloc_fragment([block_ND, block_ND], accum_dtype) + T.clear(acc) + for k in T.Pipelined(T.ceildiv(D, block_ND), num_stages=num_stages): + T.copy(O[by * block_ND : (by + 1) * block_ND, bx, k * block_ND : (k + 1) * block_ND], o) + T.copy(dO[by * block_ND : (by + 1) * block_ND, bx, k * block_ND : (k + 1) * block_ND], do) + for i, j in T.Parallel(block_ND, block_ND): + acc[i, j] += o[i, j] * do[i, j] + T.reduce_sum(acc, delta, 1) + T.copy(delta, Delta[by * block_ND : (by + 1) * block_ND, bx]) + + return preprocess_kernel + + +@tilelang.jit(out_idx=[-1]) +def postprocess( + D, + D_tail, + kv_group=1, + block_N=64, + threads=128, + dtype=T.bfloat16, + accum_dtype=T.float32, +): + assert dtype == T.bfloat16 + assert accum_dtype == T.float32 + S_kv = T.symbolic("S_kv") + + dkv_shape = [S_kv, kv_group, D + D_tail] + + @T.prim_func + def postprocess_kernel( + dKV: T.Tensor(dkv_shape, accum_dtype), + dKV_out: T.Tensor(dkv_shape, dtype), + ): + with T.Kernel(T.ceildiv(S_kv, block_N), kv_group, threads=threads) as (bx, by): + T.copy( + dKV[bx * 
block_N : (bx + 1) * block_N, by, :], + dKV_out[bx * block_N : (bx + 1) * block_N, by, :], + ) + + return postprocess_kernel + + +@tilelang.jit( + out_idx=[-2], + pass_configs={ + tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, + tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, + }, +) +def bwd( + H, + D, + D_tail, + topk, + kv_group=1, + sm_scale=None, + is_causal=True, + block_size=32, + num_stages=0, + threads=128, + indices_dtype=T.int32, + dtype=T.bfloat16, + accum_dtype=T.float32, +): + assert is_causal == True, "non-casual is not supported now" + assert topk % block_size == 0, "otherwise will load some index=0 thus causing wrong kv to be loaded" + assert dtype == T.bfloat16 + assert accum_dtype == T.float32 + assert indices_dtype == T.int32 + + if sm_scale is None: + sm_scale = (D + D_tail) ** (-0.5) + + B_plus_one = T.symbolic("B_plus_one") + S = T.symbolic("S") + + H_kv = H // kv_group + q_shape = [S, H, D + D_tail] + k_shape = [S, kv_group, D + D_tail] + o_shape = [S, H, D] + indices_shape = [S, kv_group, topk] + delta_shape = [S, H] + lse_shape = [S, H] + offsets_shape = [B_plus_one] + token_indices_shape = [S, 2] + assert indices_dtype == T.int32 + assert dtype == T.bfloat16 + assert accum_dtype == T.float32 + + H = H_kv + padded_H = max(tilelang.math.next_power_of_2(H_kv), 16) + BS = block_size + NS = tilelang.cdiv(topk, block_size) + + split_store = 2 + + @T.prim_func + def sparse_mla_bwd_kernel( + Q: T.Tensor(q_shape, dtype), + KV: T.Tensor(k_shape, dtype), + dO: T.Tensor(o_shape, dtype), + Indices: T.Tensor(indices_shape, indices_dtype), + Lse: T.Tensor(lse_shape, accum_dtype), + Delta: T.Tensor(delta_shape, accum_dtype), + Offsets: T.Tensor(offsets_shape, indices_dtype), + TokenIndices: T.Tensor(token_indices_shape, indices_dtype), + dQ: T.Tensor(q_shape, dtype), + dKV: T.Tensor(k_shape, accum_dtype), + ): + with T.Kernel(S, kv_group, threads=threads) as (b_s_i, bz): + Q_shared = T.alloc_shared([padded_H, D], dtype) + Q_tail_shared = T.alloc_shared([padded_H, D_tail], dtype) + KV_shared = T.alloc_shared([BS, D], dtype) + KV_tail_shared = T.alloc_shared([BS, D_tail], dtype) + dO_shared = T.alloc_shared([padded_H, D], dtype) + mask = T.alloc_fragment([BS], "bool") + + P_shared_cast = T.alloc_shared([padded_H, BS], dtype) + dP_shared_cast = T.alloc_shared([padded_H, BS], dtype) + dQ_shared = T.alloc_shared([padded_H, D], dtype) + dQ_tail_shared = T.alloc_shared([padded_H, D_tail], dtype) + + acc_p = T.alloc_fragment([padded_H, BS], accum_dtype) + acc_dp = T.alloc_fragment([padded_H, BS], accum_dtype) + acc_dq = T.alloc_fragment([padded_H, D], accum_dtype) + acc_dq_tail = T.alloc_fragment([padded_H, D_tail], accum_dtype) + acc_dkv = T.alloc_fragment([BS, D], accum_dtype) + acc_dkv_tail = T.alloc_fragment([BS, D_tail], accum_dtype) + acc_dkv_shared = T.view(KV_shared, shape=[BS // split_store, D], dtype=accum_dtype) + acc_dkv_tail_shared = T.view(KV_tail_shared, shape=[BS // split_store, D_tail], dtype=accum_dtype) + + b_i, s_i = TokenIndices[b_s_i, 0], TokenIndices[b_s_i, 1] + bos, eos = Offsets[b_i], Offsets[b_i + 1] + + max_kv_i = s_i + + T.copy(Q[bos + s_i, bz * padded_H : (bz + 1) * padded_H, :D], Q_shared) + T.copy(Q[bos + s_i, bz * padded_H : (bz + 1) * padded_H, D:], Q_tail_shared) + T.copy(dO[bos + s_i, bz * padded_H : (bz + 1) * padded_H, :D], dO_shared) + + T.clear(acc_dq) + T.clear(acc_dq_tail) + + T.annotate_layout( + { + dQ_shared: tilelang.layout.make_swizzled_layout(dQ_shared), + dQ_tail_shared: tilelang.layout.make_swizzled_layout(dQ_tail_shared), + 
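+ # The swizzled layouts on the dQ staging buffers help avoid shared-memory bank conflicts when the accumulated dQ/dQ_tail are copied back to global memory.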
} + ) + + # Process each block of indices + for i_i in T.Pipelined(NS, num_stages=num_stages): + # Check which indices are valid + for bi_i in T.Parallel(BS): + mask[bi_i] = (Indices[bos + s_i, bz, i_i * BS + bi_i] <= max_kv_i) & (Indices[bos + s_i, bz, i_i * BS + bi_i] != -1) + + # Compute attention scores + for h_i, bi_i in T.Parallel(padded_H, BS): + acc_p[h_i, bi_i] = T.if_then_else(mask[bi_i], 0, -T.infinity(acc_p.dtype)) + + # Load KV, V for this block of indices + for bi_i, d_i in T.Parallel(BS, D): + KV_shared[bi_i, d_i] = KV[bos + Indices[bos + s_i, bz, i_i * BS + bi_i], bz, d_i] + + T.gemm(Q_shared, KV_shared, acc_p, transpose_B=True, policy=T.GemmWarpPolicy.FullCol) + + for bi_i, d_i in T.Parallel(BS, D_tail): + KV_tail_shared[bi_i, d_i] = KV[bos + Indices[bos + s_i, bz, i_i * BS + bi_i], bz, D + d_i] + T.gemm(Q_tail_shared, KV_tail_shared, acc_p, transpose_B=True, policy=T.GemmWarpPolicy.FullCol) + + for h_i, bi_i in T.Parallel(padded_H, BS): + acc_p[h_i, bi_i] = T.exp(acc_p[h_i, bi_i] * sm_scale - Lse[bos + s_i, bz * padded_H + h_i]) + + T.copy(acc_p, P_shared_cast) + + T.gemm(dO_shared, KV_shared, acc_dp, transpose_B=True, policy=T.GemmWarpPolicy.FullCol, clear_accum=True) + + for h_i, bi_i in T.Parallel(padded_H, BS): + acc_dp[h_i, bi_i] = acc_p[h_i, bi_i] * (acc_dp[h_i, bi_i] - Delta[bos + s_i, bz * padded_H + h_i]) * sm_scale + + T.copy(acc_dp, dP_shared_cast) + T.gemm(dP_shared_cast, KV_shared, acc_dq, policy=T.GemmWarpPolicy.FullCol) + T.gemm(dP_shared_cast, KV_tail_shared, acc_dq_tail, policy=T.GemmWarpPolicy.FullCol) + + T.gemm(dP_shared_cast, Q_shared, acc_dkv, transpose_A=True, policy=T.GemmWarpPolicy.FullCol, clear_accum=True) + T.gemm(P_shared_cast, dO_shared, acc_dkv, transpose_A=True, policy=T.GemmWarpPolicy.FullCol) + + T.clear(acc_dkv_tail) + T.gemm(dP_shared_cast, Q_tail_shared, acc_dkv_tail, transpose_A=True, policy=T.GemmWarpPolicy.FullCol) + + for s in range(split_store): + for bi_i, d_i in T.Parallel(BS, D): + if bi_i < BS // split_store: + acc_dkv_shared[bi_i, d_i] = acc_dkv[bi_i + s * (BS // split_store), d_i] + + for bi_i, d_i in T.Parallel(BS, D_tail): + if bi_i < BS // split_store: + acc_dkv_tail_shared[bi_i, d_i] = acc_dkv_tail[bi_i + s * (BS // split_store), d_i] + + for bi_i, d_i in T.Parallel(BS // split_store, D // 4): + T.atomic_addx4( + dKV[bos + Indices[bos + s_i, bz, i_i * BS + bi_i + s * (BS // split_store)], bz, d_i * 4], + acc_dkv_shared[bi_i, d_i * 4], + ) + + # Atomically update dKV, dKV_tail tensors + for bi_i, d_i in T.Parallel(BS // split_store, D_tail // 4): + T.atomic_addx4( + dKV[bos + Indices[bos + s_i, bz, i_i * BS + bi_i + s * (BS // split_store)], bz, D + d_i * 4], + acc_dkv_tail_shared[bi_i, d_i * 4], + ) + + # Store the accumulated dQ + T.copy(acc_dq, dQ_shared) + T.copy(acc_dq_tail, dQ_tail_shared) + + T.copy(dQ_shared, dQ[bos + s_i, bz * padded_H : (bz + 1) * padded_H, :D]) + T.copy(dQ_tail_shared, dQ[bos + s_i, bz * padded_H : (bz + 1) * padded_H, D:]) + + return sparse_mla_bwd_kernel + + +def sparse_mla_bwd(q, kv, o, do, indices, lse, offsets, sm_scale=None, is_casual=True, return_kernel=False, delta=None): + assert q.is_contiguous() + assert kv.is_contiguous() + assert indices.is_contiguous() + assert lse.is_contiguous() + S, H, dim_plus_tail_dim = q.shape + S_kv, kv_group, _ = kv.shape + assert kv.shape[-1] == dim_plus_tail_dim + assert S == S_kv + # dim should be assigned + D = 512 + + D_tail = dim_plus_tail_dim - D + topk = indices.shape[-1] + assert indices.shape == (S, kv_group, topk) + assert lse.shape == (S, H) + 
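+ # prepare_token_indices flattens the variable-length batch: row i of token_indices holds
+ # (batch_id, position_within_sequence) for flattened token i, matching TokenIndices[bx, 0] / TokenIndices[bx, 1] in the kernel.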
+ token_indices = prepare_token_indices(offsets) + + # Get kernels + preprocess_kernel = preprocess(H, D) + bwd_kernel = bwd(H, D, D_tail, topk, kv_group, sm_scale, is_casual) + postprocess_kernel = postprocess(D, D_tail, kv_group) + + if delta is None: + delta = preprocess_kernel(o, do) + dkv = torch.zeros_like(kv, dtype=torch.float32) + dq = bwd_kernel(q, kv, do, indices, lse, delta, offsets, token_indices, dkv) + dkv = postprocess_kernel(dkv) + + return dq, dkv + + +def ref_sparse_mla_bwd_interface(q, kv, o, do, indices, lse, offsets, sm_scale=None, is_casual=True): + from sparse_mla_fwd import ref_sparse_mla_fwd_interface + + q = q.detach().clone() + kv = kv.detach().clone() + q.requires_grad = True + kv.requires_grad = True + o = ref_sparse_mla_fwd_interface(q, kv, indices, offsets, sm_scale, is_casual) + o.backward(do) + return q.grad, kv.grad + + +def test_sparse_mla_bwd(B=1, S=2048, H=64, HKV=1, DQKV=576, DV=512, topk=512, dtype=torch.bfloat16, check_correctness=True): + # Prepare data + q = torch.randn((S, H, DQKV), dtype=dtype, device="cuda").requires_grad_(True) + kv = torch.randn((S, HKV, DQKV), dtype=dtype, device="cuda").requires_grad_(True) + do = torch.randn((S, H, DV), dtype=dtype, device="cuda") + offsets = torch.tensor([0, S], dtype=torch.int32, device="cuda") + + indices = torch.full((S, HKV, topk), S, dtype=torch.int32, device="cuda") + for i in range(offsets.shape[0] - 1): + seq_len = (offsets[i + 1] - offsets[i]).item() + assert seq_len >= topk + for t in range(seq_len): + for h in range(HKV): + i_i = torch.randperm(max(1, t))[:topk] + indices[offsets[i] + t, h, : len(i_i)] = i_i + + # Forward + from sparse_mla_fwd import sparse_mla_fwd_interface + + tl_out, tl_lse = sparse_mla_fwd_interface(q, kv, indices, offsets) + + tl_dq, tl_dkv = sparse_mla_bwd(q, kv, tl_out, do, indices, tl_lse, offsets) + ref_dq, ref_dkv = ref_sparse_mla_bwd_interface(q, kv, None, do, indices, None, offsets) + + if check_correctness: + assert_tensors_similar(tl_dq, ref_dq, eps=1e-4, name="dq") + assert_tensors_similar(tl_dkv, ref_dkv, eps=1e-4, name="dkv") + print("assert_tensors_similar passed") + + per_token_flop = 2 * sum( + [ + H * DV * topk, + H * DQKV * topk, + H * DQKV * topk, + H * DQKV * topk, + H * DV * topk, + ] + ) + from tilelang.profiler import do_bench + + def fn(): + return sparse_mla_bwd(q, kv, tl_out, do, indices, tl_lse, offsets) + + ms = do_bench(fn, rep=100, warmup=250) + print(f"Average time: {ms:.3f} ms") + print(f"bwd io bandwidth = ", (B * S * max(DQKV * 2, DQKV + DV) * topk * 2) / (ms * 1e-3) / 1e12) + print(f"bwd tflops = ", per_token_flop * S / (ms * 1e-3) / 1e12) + + +if __name__ == "__main__": + test_sparse_mla_bwd(B=1, S=2048, H=64, HKV=1, DQKV=576, DV=512, topk=512, dtype=torch.bfloat16, check_correctness=True) diff --git a/examples/dsa_sparse_finetune/sparse_mla_fwd.py b/examples/dsa_sparse_finetune/sparse_mla_fwd.py new file mode 100644 index 0000000000000000000000000000000000000000..d87523695240ce3029c29e84c10c50cbfc4a39c8 --- /dev/null +++ b/examples/dsa_sparse_finetune/sparse_mla_fwd.py @@ -0,0 +1,310 @@ +# ruff: noqa +import torch +import tilelang +from tilelang import language as T +from index import prepare_token_indices + +from utils import assert_tensors_similar + + +@tilelang.jit( + out_idx=[-2, -1], + pass_configs={ + tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, + tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, + }, +) +def sparse_mla_fwd( + heads, + dim, + tail_dim, + topk, + kv_group=1, + sm_scale=None, + is_causal=True, + 
CP0=True, + block_I=32, + num_stages=2, + threads=128, +): + assert dim == tilelang.math.next_power_of_2(dim), f"haven't check padding correctness yet, dim={dim}" + assert tail_dim == tilelang.math.next_power_of_2(tail_dim), f"haven't check padding correctness yet, dim={tail_dim}" + assert is_causal == True, "non-casual is not supported" + assert topk % block_I == 0, "otherwise will load some index=0 thus causing wrong kv to be loaded" + if sm_scale is None: + sm_scale = (1.0 / (dim + tail_dim)) ** 0.5 + else: + sm_scale = sm_scale + + batch_plus_one = T.symbolic("batch_plus_one") + seq_len = T.symbolic("seq_len") + + head_kv = heads // kv_group + q_shape = [seq_len, heads, dim + tail_dim] + kv_shape = [seq_len, kv_group, dim + tail_dim] + o_shape = [seq_len, heads, dim] + indices_shape = [seq_len, kv_group, topk] + lse_shape = [seq_len, heads] + offsets_shape = [batch_plus_one] + token_indices_shape = [seq_len, 2] + indices_dtype = T.int32 + dtype = T.bfloat16 + accum_dtype = T.float32 + + G = kv_group + H = head_kv + padded_H = max(tilelang.math.next_power_of_2(head_kv), 16) + if padded_H != H: + assert kv_group == 1, ( + "here we solve the H padding automatically, other wise you should handle Q copy and Output copy with your mask (when kv_group == 1, use g_i * padded_H:(g_i+1) * padded_H would be handled automatically)" + ) + BI = block_I + NI = tilelang.cdiv(topk, block_I) + D = dim + D_tail = tail_dim + + if head_kv > 64: + assert head_kv % 64 == 0, "head_kv should be a multiple of 64" + REPLICATE_H = head_kv // 64 + else: + REPLICATE_H = 1 + + H_per_block = padded_H if REPLICATE_H == 1 else 64 + + @T.prim_func + def main( + Q: T.Tensor(q_shape, dtype), # type: ignore + KV: T.Tensor(kv_shape, dtype), # type: ignore + Indices: T.Tensor(indices_shape, indices_dtype), # type: ignore + Offsets: T.Tensor(offsets_shape, indices_dtype), # type: ignore + TokenIndices: T.Tensor(token_indices_shape, indices_dtype), # type: ignore + Output: T.Tensor(o_shape, dtype), # type: ignore + Lse: T.Tensor(lse_shape, accum_dtype), # type: ignore + ): + with T.Kernel(seq_len * REPLICATE_H, kv_group, threads=threads) as ( + bx, + by, + ): + Q_shared = T.alloc_shared([H_per_block, D], dtype) + Q_tail_shared = T.alloc_shared([H_per_block, D_tail], dtype) + KV_shared = T.alloc_shared([BI, D], dtype) + K_tail_shared = T.alloc_shared([BI, D_tail], dtype) + mask = T.alloc_fragment([BI], "bool") + + acc_o = T.alloc_fragment([H_per_block, D], accum_dtype) + acc_s = T.alloc_fragment([H_per_block, BI], accum_dtype) + S_shared = T.alloc_shared([H_per_block, BI], dtype) + sumexp = T.alloc_fragment([H_per_block], accum_dtype) + sumexp_i = T.alloc_fragment([H_per_block], accum_dtype) + alpha = T.alloc_fragment([H_per_block], accum_dtype) + m_i = T.alloc_fragment([H_per_block], accum_dtype) + m_i_prev = T.alloc_fragment([H_per_block], accum_dtype) + + T.fill(acc_o, 0) + T.fill(sumexp, 0) + T.fill(m_i, -(2**30)) # avoid -inf - inf to cause nan + + b_s_i = bx if REPLICATE_H == 1 else (bx // REPLICATE_H) + b_i, s_i = TokenIndices[b_s_i, 0], TokenIndices[b_s_i, 1] + bos, eos = Offsets[b_i], Offsets[b_i + 1] + g_i = by + q_i = s_i + max_kv_i = q_i + + H0 = g_i * padded_H + (0 if REPLICATE_H == 1 else (bx % REPLICATE_H) * 64) + H1 = H0 + H_per_block + + T.copy(Q[bos + s_i, H0:H1, :D], Q_shared) + T.copy(Q[bos + s_i, H0:H1, D:], Q_tail_shared) + + for i_i in T.Pipelined(NI, num_stages=num_stages): + for bi_i in T.Parallel(BI): + mask[bi_i] = (Indices[bos + s_i, g_i, i_i * BI + bi_i] <= max_kv_i) & (Indices[bos + s_i, g_i, i_i * 
BI + bi_i] != -1) + + for bi_i, d_i in T.Parallel(BI, D): + KV_shared[bi_i, d_i] = KV[bos + Indices[bos + s_i, g_i, i_i * BI + bi_i], g_i, d_i] + for bi_i, d_i in T.Parallel(BI, D_tail): + K_tail_shared[bi_i, d_i] = KV[bos + Indices[bos + s_i, g_i, i_i * BI + bi_i], g_i, D + d_i] + + for h_i, bi_i in T.Parallel(H_per_block, BI): + acc_s[h_i, bi_i] = T.if_then_else(mask[bi_i], 0, -T.infinity(acc_s.dtype)) + T.gemm( + Q_shared, + KV_shared, + acc_s, + transpose_B=True, + policy=T.GemmWarpPolicy.FullRow, + ) + T.gemm( + Q_tail_shared, + K_tail_shared, + acc_s, + transpose_B=True, + policy=T.GemmWarpPolicy.FullRow, + ) + T.copy(m_i, m_i_prev) + T.reduce_max(acc_s, m_i, dim=1, clear=False) + for h_i in T.Parallel(H_per_block): + alpha[h_i] = T.exp((m_i_prev[h_i] - m_i[h_i]) * sm_scale) + for h_i, bi_i in T.Parallel(H_per_block, BI): + acc_s[h_i, bi_i] = T.exp(acc_s[h_i, bi_i] * sm_scale - m_i[h_i] * sm_scale) + T.reduce_sum(acc_s, sumexp_i, dim=1) # is this a accumulate operator? + for h_i in T.Parallel(H_per_block): + sumexp[h_i] = sumexp[h_i] * alpha[h_i] + sumexp_i[h_i] + for h_i, d_i in T.Parallel(H_per_block, D): + acc_o[h_i, d_i] = acc_o[h_i, d_i] * alpha[h_i] + + T.copy(acc_s, S_shared) + T.gemm(S_shared, KV_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) + + # Rescale + for h_i, d_i in T.Parallel(H_per_block, D): + acc_o[h_i, d_i] /= sumexp[h_i] + for h_i in T.Parallel(H_per_block): + sumexp[h_i] = T.log(sumexp[h_i]) + m_i[h_i] * sm_scale + + T.copy(acc_o, Output[bos + s_i, H0:H1, :]) + T.copy(sumexp, Lse[bos + s_i, H0:H1]) + + return main + + +def sparse_mla_fwd_interface( + q, kv, indices, offsets, sm_scale=None, return_p_sum: bool = False, d_v=512, block_I=32, num_stages=2, threads=128 +): + is_casual = True + assert return_p_sum == False, "This kernel file is for fwd only" + assert q.is_contiguous() and kv.is_contiguous() and indices.is_contiguous() + seq_len, heads, dim_plus_tail_dim = q.shape + seq_len_kv, kv_group, _ = kv.shape + assert seq_len == seq_len_kv + + assert dim_plus_tail_dim == 576, "you should assign dim otherwise" + dim = d_v + + assert kv.shape[-1] == dim_plus_tail_dim + tail_dim = dim_plus_tail_dim - dim + _, _, topk = indices.shape + assert indices.shape == (seq_len, kv_group, topk) + + token_indices = prepare_token_indices(offsets) + + kernel = sparse_mla_fwd( + heads, dim, tail_dim, topk, kv_group, sm_scale, is_casual, block_I=block_I, num_stages=num_stages, threads=threads + ) + out, lse = kernel(q, kv, indices, offsets, token_indices) + return out, lse + + +def ref_sparse_mla_fwd_interface(Q, KV, Indices, offsets, sm_scale=None, is_casual=True): + Q = Q.float() + KV = KV.float() + all_o = [] + for i in range(offsets.shape[0] - 1): + q = Q[None, offsets[i] : offsets[i + 1]] + kv = KV[None, offsets[i] : offsets[i + 1]] + indices = Indices[None, offsets[i] : offsets[i + 1]].clone() + + indices = indices.transpose(1, 2) + b, sq, h, dim_q = q.shape + b, sk, g, _ = kv.shape + + assert kv.shape[-1] == 576, "you should assign dim otherwise" + dim = 512 + k = kv + v = kv[..., :dim] + + b, _, _, dim_v = v.shape + g_index = g + h_index = h // g + compressed_casual_mask = torch.arange(0, sq, dtype=torch.int32, device="cuda").view(-1, 1) >= torch.arange( + 1 - 1, sk * 1, 1, dtype=torch.int32, device="cuda" + ).view(1, -1) + + indices[indices > sk] = sk + mask = q.new_zeros(b, g_index, sq, sk + 1, dtype=torch.bool).scatter(3, indices.long(), 1) + mask = mask[..., :-1] + mask = mask & compressed_casual_mask.view(1, 1, sq, sk) + mask[:, :, : 1 - 1, 0] = True + mask = 
mask.view(b, g_index, 1, sq, sk) + + q = q.view(b, sq, g, -1, dim_q) + score = torch.einsum("bmghd,bngd->bghmn", q, k) + sm_scale = dim_q**-0.5 if sm_scale is None else sm_scale + score = score.masked_fill(~mask, float("-inf")).mul(sm_scale) + p = score.softmax(dim=-1) + p = p.view(b, g_index, h_index, -1, sq, sk) + p = p.view(b, g, -1, sq, sk) + o = torch.einsum("bghmn,bngd->bmghd", p.type(v.dtype), v) + o = o.reshape(b, sq, h, dim_v) + all_o.append(o.squeeze(0)) + o = torch.cat(all_o, dim=0) + return o.to(torch.bfloat16) + + +def test_sparse_mla_fwd( + B=1, + S=4096, + H=128, + HKV=1, + DQK=576, + DV=512, + topk=2048, + dtype=torch.bfloat16, + check_correctness=True, + block_I=64, + num_stages=2, + threads=256, +): + torch.random.manual_seed(0) + q = torch.randn((S, H, DQK), dtype=dtype, device="cuda").requires_grad_(True) + kv = torch.randn((S, HKV, DQK), dtype=dtype, device="cuda").requires_grad_(True) + offsets = torch.tensor([0, S // 2 - 1, S], dtype=torch.int32, device="cuda") + + indices = torch.full((S, HKV, topk), S, dtype=torch.int32, device="cuda") + for i in range(offsets.shape[0] - 1): + seq_len = (offsets[i + 1] - offsets[i]).item() + assert seq_len >= topk + for t in range(seq_len): + for h in range(HKV): + i_i = torch.randperm(max(1, t))[:topk] + indices[offsets[i] + t, h, : len(i_i)] = i_i + + tl_out, tl_lse = sparse_mla_fwd_interface(q, kv, indices, offsets, block_I=block_I, num_stages=num_stages, threads=threads) + + if check_correctness: + # otherwise may cause out of memory + ref_out = ref_sparse_mla_fwd_interface(q, kv, indices, offsets) + assert_tensors_similar(tl_out, ref_out, eps=1e-2, name="out") + print("assert_tensors_similar passed") + + def fn(): + return sparse_mla_fwd_interface(q, kv, indices, offsets, block_I=block_I, num_stages=num_stages, threads=threads) + + from tilelang.profiler import do_bench + + ms = do_bench( + fn, + rep=100, + warmup=250, + ) + print(f"Average time: {ms:.3f} ms") + print("fwd io bandwidth = ", (B * S * DQK * topk * 2) / (ms * 1e-3) / 1e12) + print("fwd tflops = ", (B * S * (DQK + DV) * topk * 2 * H) / (ms * 1e-3) / 1e12) + + +if __name__ == "__main__": + test_sparse_mla_fwd( + B=1, + S=4096, + H=128, + HKV=1, + DQK=576, + DV=512, + topk=1024, + dtype=torch.bfloat16, + check_correctness=True, + block_I=64, + num_stages=2, + threads=256, + ) diff --git a/examples/dsa_sparse_finetune/sparse_mla_topk_reducesum.py b/examples/dsa_sparse_finetune/sparse_mla_topk_reducesum.py new file mode 100644 index 0000000000000000000000000000000000000000..a03bc74f51e254b8cd9232eebc91bc9c6f0fa4c9 --- /dev/null +++ b/examples/dsa_sparse_finetune/sparse_mla_topk_reducesum.py @@ -0,0 +1,226 @@ +# ruff: noqa +import torch +import torch.nn as nn +import torch.nn.functional as F +import tilelang +from tilelang import language as T +from einops import repeat, rearrange, einsum +from index import prepare_token_indices +from utils import get_abs_err, get_err_ratio + +BF16 = T.bfloat16 +FP32 = T.float32 +INT32 = T.int32 + +pass_configs = { + tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, + tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, +} + + +@tilelang.jit(pass_configs=pass_configs) +def tl_sparse_mla_topk_reducesum_impl( + heads, + dim, + tail_dim, + topk, + kv_group=1, + sm_scale=None, + block_I=32, + num_stages=2, + threads=128, +): + assert dim == tilelang.math.next_power_of_2(dim), f"haven't check padding correctness yet, dim={dim}" + assert tail_dim == tilelang.math.next_power_of_2(tail_dim), f"haven't check padding correctness yet, 
dim={tail_dim}" + assert topk % block_I == 0, "otherwise will load some index=0 thus causing wrong kv to be loaded" + if sm_scale is None: + sm_scale = (1.0 / (dim + tail_dim)) ** 0.5 + + batch_plus_one = T.symbolic("batch_plus_one") + seq_len = T.symbolic("seq_len") + seq_len_kv = T.symbolic("seq_len_kv") + + head_kv = heads // kv_group + indices_dtype = T.int32 + dtype = T.bfloat16 + accum_dtype = T.float32 + + G = kv_group + H = head_kv + padded_H = max(tilelang.math.next_power_of_2(head_kv), 16) + if padded_H != H: + assert kv_group == 1, ( + "here we solve the H padding automatically, other wise you should handle Q copy and Output copy with your mask (when kv_group == 1, use g_i * padded_H:(g_i+1) * padded_H would be handled automatically)" + ) + BI = block_I + NI = tilelang.cdiv(topk, block_I) + D = dim + D_tail = tail_dim + + if head_kv > 64: + assert head_kv % 64 == 0, "head_kv should be a multiple of 64" + REPLICATE_H = head_kv // 64 + else: + REPLICATE_H = 1 + + H_per_block = padded_H if REPLICATE_H == 1 else 64 + + q_shape = [seq_len, heads, dim + tail_dim] + kv_shape = [seq_len_kv, kv_group, dim + tail_dim] + indices_shape = [seq_len, kv_group, topk] + lse_shape = [seq_len, heads] + reducesum_shape = [seq_len, kv_group, REPLICATE_H, topk] + offsets_shape = [batch_plus_one] + token_indices_shape = [seq_len, 2] + + @T.prim_func + def tl_sparse_mla_topk_reducesum_kernel( + Q: T.Tensor(q_shape, dtype), # type: ignore + KV: T.Tensor(kv_shape, dtype), # type: ignore + Indices: T.Tensor(indices_shape, indices_dtype), # type: ignore + Lse: T.Tensor(lse_shape, accum_dtype), # type: ignore + Offsets: T.Tensor(offsets_shape, indices_dtype), # type: ignore + TokenIndices: T.Tensor(token_indices_shape, indices_dtype), # type: ignore + ReduceSum: T.Tensor(reducesum_shape, accum_dtype), # type: ignore + ): + with T.Kernel(seq_len * REPLICATE_H, kv_group, threads=threads) as ( + bx, + by, + ): + Q_shared = T.alloc_shared([H_per_block, D], dtype) + Q_tail_shared = T.alloc_shared([H_per_block, D_tail], dtype) + KV_shared = T.alloc_shared([BI, D], dtype) + K_tail_shared = T.alloc_shared([BI, D_tail], dtype) + mask = T.alloc_fragment([BI], "bool") + + acc_s = T.alloc_fragment([H_per_block, BI], accum_dtype) + reducesum = T.alloc_fragment([BI], accum_dtype) + lse = T.alloc_fragment([H_per_block], accum_dtype) + + T.fill(lse, 0) + + b_s_i = bx if REPLICATE_H == 1 else (bx // REPLICATE_H) + b_i, s_i = TokenIndices[b_s_i, 0], TokenIndices[b_s_i, 1] + bos, eos = Offsets[b_i], Offsets[b_i + 1] + r_i = bx % REPLICATE_H + g_i = by + q_i = s_i + max_kv_i = q_i + + H0 = g_i * padded_H + (0 if REPLICATE_H == 1 else (bx % REPLICATE_H) * 64) + H1 = H0 + H_per_block + + T.copy(Q[bos + s_i, H0:H1, :D], Q_shared) + T.copy(Q[bos + s_i, H0:H1, D:], Q_tail_shared) + T.copy(Lse[bos + s_i, H0:H1], lse) + + for i_i in T.Pipelined(NI, num_stages=num_stages): + for bi_i in T.Parallel(BI): + mask[bi_i] = (Indices[bos + s_i, g_i, i_i * BI + bi_i] <= max_kv_i) & (Indices[bos + s_i, g_i, i_i * BI + bi_i] != -1) + + for bi_i, d_i in T.Parallel(BI, D): + KV_shared[bi_i, d_i] = KV[bos + Indices[bos + s_i, g_i, i_i * BI + bi_i], g_i, d_i] + for bi_i, d_i in T.Parallel(BI, D_tail): + K_tail_shared[bi_i, d_i] = KV[bos + Indices[bos + s_i, g_i, i_i * BI + bi_i], g_i, D + d_i] + + for h_i, bi_i in T.Parallel(H_per_block, BI): + acc_s[h_i, bi_i] = T.if_then_else(mask[bi_i], 0, -T.infinity(acc_s.dtype)) + T.gemm( + Q_shared, + KV_shared, + acc_s, + transpose_B=True, + policy=T.GemmWarpPolicy.FullRow, + ) + T.gemm( + Q_tail_shared, + 
K_tail_shared, + acc_s, + transpose_B=True, + policy=T.GemmWarpPolicy.FullRow, + ) + for h_i, bi_i in T.Parallel(H_per_block, BI): + acc_s[h_i, bi_i] = T.exp(acc_s[h_i, bi_i] * sm_scale - lse[h_i]) + T.reduce_sum(acc_s, reducesum, dim=0) + T.copy(reducesum, ReduceSum[bos + s_i, g_i, r_i, i_i * BI : i_i * BI + BI]) + + return tl_sparse_mla_topk_reducesum_kernel + + +def sparse_mla_topk_reducesum_interface( + q: torch.Tensor, + kv: torch.Tensor, + topk_indices: torch.Tensor, + lse: torch.Tensor, + offsets: torch.Tensor, + dim_v: int, +): + assert kv.shape[-2] == 1 + seq_len, heads, dim_plus_tail_dim, topk = *q.shape, topk_indices.shape[-1] + REPLICATE_H = max(heads // 64, 1) + tail_dim = dim_plus_tail_dim - dim_v + token_indices = prepare_token_indices(offsets) + + reducesum = torch.zeros([seq_len, 1, REPLICATE_H, topk], dtype=torch.float32, device=q.device) + kernel = tl_sparse_mla_topk_reducesum_impl(heads=heads, dim=dim_v, tail_dim=tail_dim, topk=topk) + kernel(q, kv, topk_indices, lse, offsets, token_indices, reducesum) + reducesum = reducesum.sum(dim=-2) # [batch, seq_len, 1, RH, topk] -> [batch, seq_len, 1, topk] + attn_score = reducesum / reducesum.sum(dim=-1, keepdim=True) + + return attn_score + + +def ref_mla_topk_softmax(Q: torch.Tensor, K: torch.Tensor, TopkIndices: torch.Tensor, offsets: torch.Tensor): + # q: [batch, seq_len, heads, dim] + # k: [batch, seq_len, dim] + sm_scale = Q.shape[-1] ** -0.5 + all_lse = [] + all_topk_score = [] + for i in range(offsets.shape[0] - 1): + q = Q[offsets[i] : offsets[i + 1]] + k = K[offsets[i] : offsets[i + 1]] + topk_indices = TopkIndices[offsets[i] : offsets[i + 1]] + seq_len = q.shape[0] + mask = (torch.arange(seq_len)[:, None] >= torch.arange(seq_len)[None, :]).unsqueeze(-2).cuda() + logits = einsum(q, k, "s1 h d, s2 d -> s1 h s2") * sm_scale + logits = torch.where(mask, logits, float("-inf")) + score = F.softmax(logits, dim=-1, dtype=torch.float32) + score_sum = score.sum(dim=-2) + topk_score = torch.gather(score_sum, dim=-1, index=topk_indices.to(torch.int64)) + topk_score = topk_score / topk_score.sum(dim=-1, keepdim=True) + max_logits = logits.amax(dim=-1).to(torch.float32) + lse = torch.log((logits - max_logits.unsqueeze(-1).to(torch.float32)).exp().sum(dim=-1)) + max_logits + all_lse.append(lse) + all_topk_score.append(topk_score) + lse = torch.cat(all_lse, dim=0) + topk_score = torch.cat(all_topk_score, dim=0) + return lse, topk_score + + +def test_kernel( + B=1, + S=2048, + H=16, + D=512, + tail_D=64, + topk=128, +): + torch.manual_seed(42) + + q = torch.randn((S, H, D + tail_D)).cuda().bfloat16() + kv = torch.randn((S, D + tail_D)).cuda().bfloat16() + offsets = torch.tensor([0, 1023, S], dtype=torch.int32).cuda() + + topk_indices = repeat(torch.arange(topk, dtype=torch.int32).cuda(), "k -> s k", s=S).contiguous() + + lse, ref_attn_score = ref_mla_topk_softmax(q, kv, topk_indices, offsets) + + kv = kv.unsqueeze(-2) + topk_indices = topk_indices.unsqueeze(-2) + + attn_score = sparse_mla_topk_reducesum_interface(q, kv, topk_indices, lse, offsets, dim_v=D).squeeze(-2) + print(f"attn_score err: {get_abs_err(attn_score, ref_attn_score):.6f} ratio: {get_err_ratio(attn_score, ref_attn_score):.6f}") + + +if __name__ == "__main__": + test_kernel() diff --git a/examples/dsa_sparse_finetune/utils.py b/examples/dsa_sparse_finetune/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..96afd064dc0f83f0e813fa4093f10d2fd309dfce --- /dev/null +++ b/examples/dsa_sparse_finetune/utils.py @@ -0,0 +1,73 @@ +import torch + + +def 
get_abs_err(y, x): + x = x.to(torch.float32) + y = y.to(torch.float32) + return (x - y).flatten().abs().max().item() + + +def get_err_ratio(y, x): + x = x.to(torch.float32) + y = y.to(torch.float32) + err = (x - y).flatten().square().mean().sqrt().item() + base = (x).flatten().square().mean().sqrt().item() + return err / base + + +def calculate_tensor_similarity(x, y, name="tensor"): + """ + Calculate similarity between two tensors using a normalized dot product metric. + + Unlike torch.testing.assert_close which uses absolute/relative tolerance based on + element-wise differences, this function computes a global similarity score: + sim = 2 * <x, y> / (||x||^2 + ||y||^2) + + This metric is scale-invariant and measures the cosine-like similarity normalized + by the magnitude of both tensors. It returns 1 for identical tensors and values + closer to 0 for dissimilar ones. This is particularly useful for comparing tensors + with varying magnitudes where relative errors matter more than absolute differences. + + Args: + x: First tensor to compare + y: Second tensor to compare + name: Name of the tensor for logging purposes + + Returns: + Similarity score in range [0, 1] where 1 means identical + """ + x, y = x.data.double(), y.data.double() + denominator = (x * x + y * y).sum() + if denominator == 0: + print(f"\033[33mWARNING: {name} all zero\033[0m") + return 1 + sim = 2 * (x * y).sum() / denominator + return sim + + +def assert_tensors_similar(x, y, eps=1e-8, name="tensor", raise_assert=True): + """ + Assert that two tensors are similar using a global similarity metric. + + Key differences from torch.testing.assert_close: + - torch.testing.assert_close: Uses element-wise comparison with rtol/atol, checking + that |x - y| <= atol + rtol * |y| for each element. It's sensitive to outliers + and requires all elements to satisfy the tolerance. + - assert_tensors_similar: Uses a single global similarity score (1 - sim) where sim is the + normalized dot product. It's more robust to outliers and focuses on overall + tensor similarity rather than element-wise precision. This is better suited for + comparing large tensors where a few outlier elements shouldn't fail the test. 
+ + Args: + x: First tensor to compare + y: Second tensor to compare + eps: Maximum allowed difference (1 - similarity), default 1e-8 + name: Name of the tensor for error messages + raise_assert: Whether to raise assertion error on failure + """ + sim = calculate_tensor_similarity(x, y, name) + diff = 1.0 - sim + if not (0 <= diff <= eps): + print(f"\033[31mERROR: {name} similarity check failed, diff={diff:.2e} (threshold={eps:.2e})\033[0m") + if raise_assert: + assert False # noqa: B011 diff --git a/examples/dynamic_shape/example_dynamic.py b/examples/dynamic_shape/example_dynamic.py deleted file mode 100644 index be018c8b70c76e8ac22d4c6154aca5acc0324a2c..0000000000000000000000000000000000000000 --- a/examples/dynamic_shape/example_dynamic.py +++ /dev/null @@ -1,111 +0,0 @@ -import tilelang -import tilelang.language as T -import tilelang.testing -from tilelang import tvm as tvm - - -@tilelang.jit(pass_configs={"tl.disable_dynamic_tail_split": True, "tl.dynamic_alignment": 8}) -def matmul_dynamic_mnk( - block_M, - block_N, - block_K, - trans_A, - trans_B, - in_dtype, - out_dtype, - accum_dtype, - num_stages, - threads, -): - M = tvm.te.var("m") - N = tvm.te.var("n") - K = tvm.te.var("k") - - A_shape = (K, M) if trans_A else (M, K) - B_shape = (N, K) if trans_B else (K, N) - - A_shared_shape = (block_K, block_M) if trans_A else (block_M, block_K) - B_shared_shape = (block_N, block_K) if trans_B else (block_K, block_N) - - @T.prim_func - def dynamic_matmul( - A: T.Tensor(A_shape, in_dtype), - B: T.Tensor(B_shape, in_dtype), - C: T.Tensor((M, N), out_dtype), - ): - with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): - A_shared = T.alloc_shared(A_shared_shape, in_dtype) - B_shared = T.alloc_shared(B_shared_shape, in_dtype) - C_local = T.alloc_fragment((block_M, block_N), accum_dtype) - T.clear(C_local) - for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages): - if trans_A: - T.copy(A[k * block_K, by * block_M], A_shared) - else: - T.copy(A[by * block_M, k * block_K], A_shared) - if trans_B: - T.copy(B[bx * block_N, k * block_K], B_shared) - else: - T.copy(B[k * block_K, bx * block_N], B_shared) - T.gemm(A_shared, B_shared, C_local, trans_A, trans_B) - T.copy(C_local, C[by * block_M, bx * block_N]) - - return dynamic_matmul - - -def matmul_dynamic(M, N, K, block_M, block_N, block_K, trans_A, trans_B, in_dtype, out_dtype, - accum_dtype, num_stages, threads): - print( - f"M: {M}, N: {N}, K: {K}, block_M: {block_M}, block_N: {block_N}, block_K: {block_K}, trans_A: {trans_A}, trans_B: {trans_B}, in_dtype: {in_dtype}, out_dtype: {out_dtype}, accum_dtype: {accum_dtype}, num_stages: {num_stages}, threads: {threads}" - ) - kernel = matmul_dynamic_mnk(block_M, block_N, block_K, trans_A, trans_B, in_dtype, out_dtype, - accum_dtype, num_stages, threads) - - import torch - if trans_A: - A = torch.rand(K, M, device="cuda", dtype=getattr(torch, in_dtype)) - else: - A = torch.rand(M, K, device="cuda", dtype=getattr(torch, in_dtype)) - if trans_B: - B = torch.rand(N, K, device="cuda", dtype=getattr(torch, in_dtype)) - else: - B = torch.rand(K, N, device="cuda", dtype=getattr(torch, in_dtype)) - C = torch.zeros(M, N, device="cuda", dtype=getattr(torch, out_dtype)) - - kernel(A, B, C) - - def ref_program(A, B): - import torch - - if trans_A: - A = A.T - if trans_B: - B = B.T - C = torch.matmul(A.to(torch.float), B.to(torch.float)) - C = C.to(torch.__getattribute__(out_dtype)) - return C - - # Get Reference Result - ref_c = ref_program(A, B) - - 
torch.testing.assert_close(C, ref_c, rtol=1e-2, atol=1e-2) - print("Kernel output matches PyTorch reference.") - - profiler = kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Normal) - latency = profiler.do_bench(input_tensors=[A, B, C]) - print(f"Latency: {latency} ms") - - -def main(M=16384, N=16384, K=16384): - block_M, block_N, block_K = 128, 128, 32 - trans_A, trans_B = False, False - in_dtype, out_dtype = "float16", "float16" - accum_dtype = "float32" - num_stages = 3 - threads = 128 - matmul_dynamic(M, N, K, block_M, block_N, block_K, trans_A, trans_B, in_dtype, out_dtype, - accum_dtype, num_stages, threads) - - -if __name__ == "__main__": - main() diff --git a/examples/dynamic_shape/test_example_dynamic.py b/examples/dynamic_shape/test_example_dynamic.py deleted file mode 100644 index 36a3743f1ad60503387df9c05e818ad0e51422d2..0000000000000000000000000000000000000000 --- a/examples/dynamic_shape/test_example_dynamic.py +++ /dev/null @@ -1,10 +0,0 @@ -import tilelang.testing -import example_dynamic - - -def test_example_dynamic(): - example_dynamic.main(M=1024, N=1024, K=1024) - - -if __name__ == "__main__": - tilelang.testing.main() diff --git a/examples/elementwise/example_elementwise_add.py b/examples/elementwise/example_elementwise_add.py index bc9bb4df5bbc9daf13a963d5714478c8d5b54f25..f075c64fd669ff4ce15ae518b3e17998aad8edae 100644 --- a/examples/elementwise/example_elementwise_add.py +++ b/examples/elementwise/example_elementwise_add.py @@ -3,19 +3,25 @@ import itertools import torch import tilelang import tilelang.language as T -from tilelang.autotuner import AutoTuner def ref_program(x, y): return x + y +def get_configs(): + block_M = [64, 128, 256] + block_N = [64, 128, 256] + threads = [64, 128, 256] + configs = list(itertools.product(block_M, block_N, threads)) + return [{"block_M": bm, "block_N": bn, "threads": th} for bm, bn, th in configs] + + +@tilelang.autotune(configs=get_configs()) @tilelang.jit(out_idx=[-1]) def elementwise_add(M, N, block_M, block_N, in_dtype, out_dtype, threads): - @T.prim_func - def elem_add(A: T.Tensor((M, N), in_dtype), B: T.Tensor((M, N), in_dtype), C: T.Tensor( - (M, N), out_dtype)): + def elem_add(A: T.Tensor((M, N), in_dtype), B: T.Tensor((M, N), in_dtype), C: T.Tensor((M, N), out_dtype)): with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): A_shared = T.alloc_shared((block_M, block_N), in_dtype) B_shared = T.alloc_shared((block_M, block_N), in_dtype) @@ -24,7 +30,7 @@ def elementwise_add(M, N, block_M, block_N, in_dtype, out_dtype, threads): T.copy(A[by * block_M, bx * block_N], A_shared) T.copy(B[by * block_M, bx * block_N], B_shared) - for (local_y, local_x) in T.Parallel(block_M, block_N): + for local_y, local_x in T.Parallel(block_M, block_N): C_local[local_y, local_x] = A_shared[local_y, local_x] + B_shared[local_y, local_x] T.copy(C_local, C_shared) T.copy(C_shared, C[by * block_M, bx * block_N]) @@ -32,53 +38,25 @@ def elementwise_add(M, N, block_M, block_N, in_dtype, out_dtype, threads): return elem_add -def get_configs(M, N): - block_M = [64, 128, 256] - block_N = [64, 128, 256] - threads = [64, 128, 256] - configs = list(itertools.product(block_M, block_N, threads)) - return [{"block_M": bm, "block_N": bn, "threads": th} for bm, bn, th in configs] - - -def get_best_config(M, N): - - def kernel(block_M=None, block_N=None, threads=None): - return elementwise_add(M, N, block_M, block_N, "float32", "float32", threads) - - autotuner = AutoTuner.from_kernel( - kernel=kernel, 
configs=get_configs(M, N)).set_compile_args( - out_idx=[-1], - target="cuda", - ).set_profile_args( - supply_type=tilelang.TensorSupplyType.Auto, - ref_prog=ref_program, - skip_check=False, - ) - return autotuner.run(warmup=3, rep=20) - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--m", type=int, default=1024) - parser.add_argument("--n", type=int, default=1024) - parser.add_argument("--use_autotune", action="store_true", default=False) - args, _ = parser.parse_known_args() - M, N = args.m, args.n - +def main(M=1024, N=1024, use_autotune=False): a = torch.randn(M, N, dtype=torch.float32, device="cuda") b = torch.randn(M, N, dtype=torch.float32, device="cuda") - if args.use_autotune: - result = get_best_config(M, N) - kernel = result.kernel + if use_autotune: + kernel = elementwise_add(M, N, in_dtype=T.float32, out_dtype=T.float32) else: # Default config config = {"block_M": 32, "block_N": 32, "threads": 128} - kernel = elementwise_add(M, N, **config, in_dtype="float32", out_dtype="float32") + kernel = elementwise_add(M, N, **config, in_dtype=T.float32, out_dtype=T.float32) out = kernel(a, b) torch.testing.assert_close(out, ref_program(a, b), rtol=1e-2, atol=1e-2) if __name__ == "__main__": - main() + parser = argparse.ArgumentParser() + parser.add_argument("--m", type=int, default=1024) + parser.add_argument("--n", type=int, default=1024) + parser.add_argument("--use_autotune", action="store_true", default=False) + args, _ = parser.parse_known_args() + main(args.m, args.n, args.use_autotune) diff --git a/examples/elementwise/test_example_elementwise.py b/examples/elementwise/test_example_elementwise.py index f1668f4aa645bf1c8b530195f323675451555f1f..24f675cd6a3778280ce1a52c1b6e6ca54aa8393c 100644 --- a/examples/elementwise/test_example_elementwise.py +++ b/examples/elementwise/test_example_elementwise.py @@ -6,5 +6,9 @@ def test_example_elementwise_add(): example_elementwise_add.main() +def test_example_elementwise_add_autotune(): + example_elementwise_add.main(use_autotune=True) + + if __name__ == "__main__": tilelang.testing.main() diff --git a/examples/flash_attention/README.md b/examples/flash_attention/README.md index be11a8dc64aea5aa6fe93eb46f68268302715088..633727ec4e9270b66176db82d3e13f430895c33a 100644 --- a/examples/flash_attention/README.md +++ b/examples/flash_attention/README.md @@ -77,7 +77,9 @@ def flash_attention( # Compute the maximum value per row on dimension 1 (block_N) T.reduce_max(acc_s, scores_max, dim=1, clear=False) - + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) + # Compute the factor by which we need to rescale previous partial sums for i in T.Parallel(block_M): scores_scale[i] = T.exp2(scores_max_prev[i] - scores_max[i]) diff --git a/examples/flash_attention/bert_padding.py b/examples/flash_attention/bert_padding.py index 7058fd773d6a1250eac24083a124e8c98543028c..15c4097ce77a21ebcd2060b53c629e7a89972b88 100644 --- a/examples/flash_attention/bert_padding.py +++ b/examples/flash_attention/bert_padding.py @@ -6,7 +6,6 @@ from einops import rearrange, repeat class IndexFirstAxis(torch.autograd.Function): - @staticmethod def forward(ctx, input, indices): ctx.save_for_backward(indices) @@ -15,9 +14,7 @@ class IndexFirstAxis(torch.autograd.Function): second_dim = other_shape.numel() # TD [2022-03-04] For some reason torch.gather is a bit faster than indexing. # return input[indices] - return torch.gather( - rearrange(input, "b ... 
-> b (...)"), 0, - repeat(indices, "z -> z d", d=second_dim)).reshape(-1, *other_shape) + return torch.gather(rearrange(input, "b ... -> b (...)"), 0, repeat(indices, "z -> z d", d=second_dim)).reshape(-1, *other_shape) @staticmethod def backward(ctx, grad_output): @@ -40,14 +37,12 @@ index_first_axis = IndexFirstAxis.apply class IndexPutFirstAxis(torch.autograd.Function): - @staticmethod def forward(ctx, values, indices, first_axis_dim): ctx.save_for_backward(indices) assert indices.ndim == 1 assert values.ndim >= 2 - output = torch.zeros( - first_axis_dim, *values.shape[1:], device=values.device, dtype=values.dtype) + output = torch.zeros(first_axis_dim, *values.shape[1:], device=values.device, dtype=values.dtype) # TD [2022-03-04] For some reason torch.scatter is a bit faster than indexing. output[indices] = values # output.scatter_(0, repeat(indices, 'z -> z d', d=values.shape[1]), values) @@ -66,7 +61,6 @@ index_put_first_axis = IndexPutFirstAxis.apply class IndexFirstAxisResidual(torch.autograd.Function): - @staticmethod def forward(ctx, input, indices): ctx.save_for_backward(indices) @@ -128,7 +122,7 @@ def unpad_input_for_concatenated_sequences(hidden_states, attention_mask_in_leng """ Supports concatenating short samples in one sequence. The attention_mask_in_length is utilized to mask other short samples. It helps efficient training of variant lengths-based samples (e.g., the supervised fine-tuning task in large language model). The motivation for this function is explained [here](https://github.com/Dao-AILab/flash-attention/issues/432#issuecomment-1668822286). - + For example, if batch = 3 and seqlen = 6, the attention_mask_in_length is: ``` [ @@ -177,9 +171,7 @@ def unpad_input_for_concatenated_sequences(hidden_states, attention_mask_in_leng """ length = attention_mask_in_length.sum(dim=-1) seqlen = attention_mask_in_length.size(-1) - attention_mask_2d = torch.arange( - seqlen, device=length.device, dtype=length.dtype).expand(len(length), - seqlen) < length.unsqueeze(1) + attention_mask_2d = torch.arange(seqlen, device=length.device, dtype=length.dtype).expand(len(length), seqlen) < length.unsqueeze(1) real_indices_idx = torch.nonzero(attention_mask_in_length.flatten(), as_tuple=False).flatten() seqlens_in_batch = attention_mask_in_length.flatten()[real_indices_idx] indices = torch.nonzero(attention_mask_2d.flatten(), as_tuple=False).flatten() diff --git a/examples/flash_attention/example_gqa_bwd.py b/examples/flash_attention/example_gqa_bwd.py index 907a121d264f5ce2759f2767c1b052071d34f2cb..89c1166693c672a8fd0021419837c950a0651df9 100644 --- a/examples/flash_attention/example_gqa_bwd.py +++ b/examples/flash_attention/example_gqa_bwd.py @@ -6,25 +6,27 @@ import argparse @tilelang.jit( - out_idx=[3, 4], pass_configs={ + out_idx=[3, 4], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def flashattn_fwd(batch, heads, seq_len, dim_qk, dim_v, is_causal, block_M, block_N, groups=1): - scale = (1.0 / dim_qk)**0.5 * 1.44269504 # log2(e) + scale = (1.0 / dim_qk) ** 0.5 * 1.44269504 # log2(e) head_kv = heads // groups q_shape = [batch, seq_len, heads, dim_qk] k_shape = [batch, seq_len, head_kv, dim_qk] v_shape = [batch, seq_len, head_kv, dim_v] - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 @T.prim_func def flash_fwd( - Q: T.Tensor(q_shape, dtype), # type: ignore - K: T.Tensor(k_shape, dtype), # type: ignore - V: T.Tensor(v_shape, dtype), # type: ignore - Output: T.Tensor([batch, seq_len, heads, dim_v], dtype), # 
type: ignore - lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + Q: T.Tensor(q_shape, dtype), # type: ignore + K: T.Tensor(k_shape, dtype), # type: ignore + V: T.Tensor(v_shape, dtype), # type: ignore + Output: T.Tensor([batch, seq_len, heads, dim_v], dtype), # type: ignore + lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore ): with T.Kernel(T.ceildiv(seq_len, block_M), heads, batch, threads=256) as (bx, by, bz): Q_shared = T.alloc_shared([block_M, dim_qk], dtype) @@ -40,25 +42,25 @@ def flashattn_fwd(batch, heads, seq_len, dim_qk, dim_v, is_causal, block_M, bloc logsum = T.alloc_fragment([block_M], accum_dtype) T.annotate_layout({Q_shared: tilelang.layout.make_swizzled_layout(Q_shared)}) - T.copy(Q[bz, bx * block_M:(bx + 1) * block_M, by, :], Q_shared) + T.copy(Q[bz, bx * block_M : (bx + 1) * block_M, by, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) - loop_range = ( - T.ceildiv( - (bx + 1) * block_M, block_N) if is_causal else T.ceildiv(seq_len, block_N)) + loop_range = T.ceildiv((bx + 1) * block_M, block_N) if is_causal else T.ceildiv(seq_len, block_N) for k in T.Pipelined(loop_range, num_stages=1): - T.copy(K[bz, k * block_N:(k + 1) * block_N, by // groups, :], K_shared) + T.copy(K[bz, k * block_N : (k + 1) * block_N, by // groups, :], K_shared) if is_causal: for i, j in T.Parallel(block_M, block_N): - acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0, - -T.infinity(acc_s.dtype)) + acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0, -T.infinity(acc_s.dtype)) else: - T.clear(acc_s) + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.if_then_else(k * block_N + j >= seq_len, -T.infinity(acc_s.dtype), 0) T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - T.copy(V[bz, k * block_N:(k + 1) * block_N, by // groups, :], V_shared) + T.copy(V[bz, k * block_N : (k + 1) * block_N, by // groups, :], V_shared) T.copy(scores_max, scores_max_prev) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) for i in T.Parallel(block_M): scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_M, dim_v): @@ -72,29 +74,31 @@ def flashattn_fwd(batch, heads, seq_len, dim_qk, dim_v, is_causal, block_M, bloc logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] for i, j in T.Parallel(block_M, dim_v): acc_o[i, j] /= logsum[i] - T.copy(acc_o, Output[bz, bx * block_M:(bx + 1) * block_M, by, :]) + T.copy(acc_o, Output[bz, bx * block_M : (bx + 1) * block_M, by, :]) for i in T.Parallel(block_M): logsum[i] = T.log2(logsum[i]) + scores_max[i] * scale - T.copy(logsum, lse[bz, by, bx * block_M:(bx + 1) * block_M]) + T.copy(logsum, lse[bz, by, bx * block_M : (bx + 1) * block_M]) return flash_fwd @tilelang.jit( - out_idx=[2], pass_configs={ + out_idx=[2], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def flashattn_bwd_preprocess(batch, heads, seq_len, dim_v): - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 shape = [batch, seq_len, heads, dim_v] blk = 32 @T.prim_func def flash_bwd_prep( - O: T.Tensor(shape, dtype), # type: ignore - dO: T.Tensor(shape, dtype), # type: ignore - Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + O: T.Tensor(shape, dtype), # type: ignore + dO: T.Tensor(shape, dtype), # type: ignore + Delta: 
T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore ): with T.Kernel(heads, T.ceildiv(seq_len, blk), batch) as (bx, by, bz): o = T.alloc_fragment([blk, blk], dtype) @@ -103,81 +107,74 @@ def flashattn_bwd_preprocess(batch, heads, seq_len, dim_v): delta = T.alloc_fragment([blk], accum_dtype) T.clear(acc) for k in range(T.ceildiv(dim_v, blk)): - T.copy(O[bz, by * blk:(by + 1) * blk, bx, k * blk:(k + 1) * blk], o) - T.copy(dO[bz, by * blk:(by + 1) * blk, bx, k * blk:(k + 1) * blk], do) + T.copy(O[bz, by * blk : (by + 1) * blk, bx, k * blk : (k + 1) * blk], o) + T.copy(dO[bz, by * blk : (by + 1) * blk, bx, k * blk : (k + 1) * blk], do) for i, j in T.Parallel(blk, blk): acc[i, j] += o[i, j] * do[i, j] T.reduce_sum(acc, delta, 1) - T.copy(delta, Delta[bz, bx, by * blk:(by + 1) * blk]) + T.copy(delta, Delta[bz, bx, by * blk : (by + 1) * blk]) return flash_bwd_prep def make_dq_layout(dQ): # atomicAdd can not be vectorized, so we need to reorder dq to match the 8x8 gemm fragment - return T.Layout(dQ.shape, - lambda b, l, h, d: [b, l // 8, h, d // 8, (d % 2), 4 * (l % 8) + (d % 8) // 2]) + return T.Layout(dQ.shape, lambda b, l, h, d: [b, l // 8, h, d // 8, (d % 2), 4 * (l % 8) + (d % 8) // 2]) @tilelang.jit( - out_idx=[1], pass_configs={ + out_idx=[1], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def flashattn_bwd_postprocess(batch, heads, seq_len, dim_qk): - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 shape = [batch, seq_len, heads, dim_qk] blk = 64 @T.prim_func def flash_bwd_post( - dQ: T.Tensor(shape, accum_dtype), # type: ignore - dQ_out: T.Tensor(shape, dtype), # type: ignore + dQ: T.Tensor(shape, accum_dtype), # type: ignore + dQ_out: T.Tensor(shape, dtype), # type: ignore ): with T.Kernel(T.ceildiv(seq_len, blk), heads, batch, threads=128) as (bx, by, bz): T.annotate_layout({dQ: make_dq_layout(dQ)}) T.copy( - dQ[bz, bx * blk:(bx + 1) * blk, by, :], - dQ_out[bz, bx * blk:(bx + 1) * blk, by, :], + dQ[bz, bx * blk : (bx + 1) * blk, by, :], + dQ_out[bz, bx * blk : (bx + 1) * blk, by, :], ) return flash_bwd_post -@tilelang.jit(pass_configs={ - tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, -}) -def flashattn_bwd_atomic_add(batch, - heads, - seq_len, - dim_qk, - dim_v, - is_causal, - block_M, - block_N, - threads=256, - num_stages=2, - groups=1): - sm_scale = (1.0 / dim_qk)**0.5 - scale = (1.0 / dim_qk)**0.5 * 1.44269504 # log2(e) +@tilelang.jit( + pass_configs={ + tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, + } +) +def flashattn_bwd_atomic_add(batch, heads, seq_len, dim_qk, dim_v, is_causal, block_M, block_N, threads=256, num_stages=2, groups=1): + sm_scale = (1.0 / dim_qk) ** 0.5 + scale = (1.0 / dim_qk) ** 0.5 * 1.44269504 # log2(e) head_kv = heads // groups q_shape = [batch, seq_len, heads, dim_qk] k_shape = [batch, seq_len, head_kv, dim_qk] v_shape = [batch, seq_len, head_kv, dim_v] - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 @T.prim_func def flash_bwd( - Q: T.Tensor(q_shape, dtype), # type: ignore - K: T.Tensor(k_shape, dtype), # type: ignore - V: T.Tensor(v_shape, dtype), # type: ignore - dO: T.Tensor([batch, seq_len, heads, dim_v], dtype), # type: ignore - lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore - Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore - dQ: T.Tensor(q_shape, accum_dtype), # type: ignore - dK: T.Tensor(k_shape, accum_dtype), # type: ignore - dV: T.Tensor(v_shape, accum_dtype), # type: 
ignore + Q: T.Tensor(q_shape, dtype), # type: ignore + K: T.Tensor(k_shape, dtype), # type: ignore + V: T.Tensor(v_shape, dtype), # type: ignore + dO: T.Tensor([batch, seq_len, heads, dim_v], dtype), # type: ignore + lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + dQ: T.Tensor(q_shape, accum_dtype), # type: ignore + dK: T.Tensor(k_shape, accum_dtype), # type: ignore + dV: T.Tensor(v_shape, accum_dtype), # type: ignore ): with T.Kernel(heads, T.ceildiv(seq_len, block_M), batch, threads=threads) as (bx, by, bz): K_shared = T.alloc_shared([block_M, dim_qk], dtype) @@ -197,35 +194,36 @@ def flashattn_bwd_atomic_add(batch, dk_shared = T.alloc_shared([block_M, dim_qk], accum_dtype) dv_shared = T.alloc_shared([block_M, dim_v], accum_dtype) - T.annotate_layout({ - dQ: make_dq_layout(dQ), - K_shared: tilelang.layout.make_swizzled_layout(K_shared), - }) + T.annotate_layout( + { + dQ: make_dq_layout(dQ), + K_shared: tilelang.layout.make_swizzled_layout(K_shared), + } + ) - T.copy(K[bz, by * block_M:(by + 1) * block_M, bx // groups, :], K_shared) - T.copy(V[bz, by * block_M:(by + 1) * block_M, bx // groups, :], V_shared) + T.copy(K[bz, by * block_M : (by + 1) * block_M, bx // groups, :], K_shared) + T.copy(V[bz, by * block_M : (by + 1) * block_M, bx // groups, :], V_shared) T.clear(dv) T.clear(dk) loop_st = T.floordiv(by * block_M, block_N) if is_causal else 0 loop_ed = T.ceildiv(seq_len, block_N) for k in T.Pipelined(loop_st, loop_ed, num_stages=num_stages): - T.copy(Q[bz, k * block_N:(k + 1) * block_N, bx, :], q) + T.copy(Q[bz, k * block_N : (k + 1) * block_N, bx, :], q) T.clear(qkT) T.gemm(K_shared, q, qkT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - T.copy(lse[bz, bx, k * block_N:(k + 1) * block_N], lse_shared) + T.copy(lse[bz, bx, k * block_N : (k + 1) * block_N], lse_shared) for i, j in T.Parallel(block_M, block_N): qkT[i, j] = T.exp2(qkT[i, j] * scale - lse_shared[j]) if is_causal: for i, j in T.Parallel(block_M, block_N): - qkT[i, j] = T.if_then_else(by * block_M + i <= k * block_N + j, qkT[i, j], - 0) - T.copy(dO[bz, k * block_N:(k + 1) * block_N, bx, :], do) + qkT[i, j] = T.if_then_else(by * block_M + i <= k * block_N + j, qkT[i, j], 0) + T.copy(dO[bz, k * block_N : (k + 1) * block_N, bx, :], do) T.clear(dsT) T.gemm(V_shared, do, dsT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) T.copy(qkT, qkT_cast) T.gemm(qkT_cast, do, dv, policy=T.GemmWarpPolicy.FullRow) - T.copy(Delta[bz, bx, k * block_N:(k + 1) * block_N], delta) + T.copy(Delta[bz, bx, k * block_N : (k + 1) * block_N], delta) for i, j in T.Parallel(block_M, block_N): dsT_cast[i, j] = qkT[i, j] * (dsT[i, j] - delta[j]) * sm_scale @@ -237,49 +235,41 @@ def flashattn_bwd_atomic_add(batch, for i, j in T.Parallel(block_N, dim_qk): T.atomic_add(dQ[bz, k * block_N + i, bx, j], dq[i, j]) T.copy(dv, dv_shared) - T.atomic_add(dV[bz, by * block_M:(by + 1) * block_M, bx // groups, :], dv_shared) + T.atomic_add(dV[bz, by * block_M : (by + 1) * block_M, bx // groups, :], dv_shared) T.copy(dk, dk_shared) - T.atomic_add(dK[bz, by * block_M:(by + 1) * block_M, bx // groups, :], dk_shared) + T.atomic_add(dK[bz, by * block_M : (by + 1) * block_M, bx // groups, :], dk_shared) return flash_bwd -@tilelang.jit(pass_configs={ - tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, -}) -def flashattn_bwd_split(batch, - heads, - seq_len, - dim_qk, - dim_v, - is_causal, - block_M, - block_N, - threads=256, - num_stages=2, - groups=1): - sm_scale = (1.0 / 
dim_qk)**0.5 - scale = (1.0 / dim_qk)**0.5 * 1.44269504 # log2(e) +@tilelang.jit( + pass_configs={ + tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, + } +) +def flashattn_bwd_split(batch, heads, seq_len, dim_qk, dim_v, is_causal, block_M, block_N, threads=256, num_stages=2, groups=1): + sm_scale = (1.0 / dim_qk) ** 0.5 + scale = (1.0 / dim_qk) ** 0.5 * 1.44269504 # log2(e) head_kv = heads // groups q_shape = [batch, seq_len, heads, dim_qk] k_shape = [batch, seq_len, head_kv, dim_qk] v_shape = [batch, seq_len, head_kv, dim_v] dk_shape = [groups, batch, seq_len, head_kv, dim_qk] # sum after kernel dv_shape = [groups, batch, seq_len, head_kv, dim_v] # sum after kernel - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 @T.prim_func def flash_bwd( - Q: T.Tensor(q_shape, dtype), # type: ignore - K: T.Tensor(k_shape, dtype), # type: ignore - V: T.Tensor(v_shape, dtype), # type: ignore - dO: T.Tensor([batch, seq_len, heads, dim_v], dtype), # type: ignore - lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore - Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore - dQ: T.Tensor(q_shape, accum_dtype), # type: ignore - dK: T.Tensor(dk_shape, dtype), # type: ignore - dV: T.Tensor(dv_shape, dtype), # type: ignore + Q: T.Tensor(q_shape, dtype), # type: ignore + K: T.Tensor(k_shape, dtype), # type: ignore + V: T.Tensor(v_shape, dtype), # type: ignore + dO: T.Tensor([batch, seq_len, heads, dim_v], dtype), # type: ignore + lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + dQ: T.Tensor(q_shape, accum_dtype), # type: ignore + dK: T.Tensor(dk_shape, dtype), # type: ignore + dV: T.Tensor(dv_shape, dtype), # type: ignore ): with T.Kernel(heads, T.ceildiv(seq_len, block_M), batch, threads=threads) as (bx, by, bz): K_shared = T.alloc_shared([block_M, dim_qk], dtype) @@ -299,37 +289,38 @@ def flashattn_bwd_split(batch, dv_shared = T.alloc_shared([block_M, dim_v], dtype) dk_shared = T.alloc_shared([block_M, dim_qk], dtype) - T.annotate_layout({ - dQ: make_dq_layout(dQ), - K_shared: tilelang.layout.make_swizzled_layout(K_shared), - dv_shared: tilelang.layout.make_swizzled_layout(dv_shared), - dk_shared: tilelang.layout.make_swizzled_layout(dk_shared), - }) + T.annotate_layout( + { + dQ: make_dq_layout(dQ), + K_shared: tilelang.layout.make_swizzled_layout(K_shared), + dv_shared: tilelang.layout.make_swizzled_layout(dv_shared), + dk_shared: tilelang.layout.make_swizzled_layout(dk_shared), + } + ) - T.copy(K[bz, by * block_M:(by + 1) * block_M, bx // groups, :], K_shared) - T.copy(V[bz, by * block_M:(by + 1) * block_M, bx // groups, :], V_shared) + T.copy(K[bz, by * block_M : (by + 1) * block_M, bx // groups, :], K_shared) + T.copy(V[bz, by * block_M : (by + 1) * block_M, bx // groups, :], V_shared) T.clear(dv) T.clear(dk) loop_st = T.floordiv(by * block_M, block_N) if is_causal else 0 loop_ed = T.ceildiv(seq_len, block_N) for k in T.Pipelined(loop_st, loop_ed, num_stages=num_stages): - T.copy(Q[bz, k * block_N:(k + 1) * block_N, bx, :], q) + T.copy(Q[bz, k * block_N : (k + 1) * block_N, bx, :], q) T.clear(qkT) T.gemm(K_shared, q, qkT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - T.copy(dO[bz, k * block_N:(k + 1) * block_N, bx, :], do) + T.copy(dO[bz, k * block_N : (k + 1) * block_N, bx, :], do) T.clear(dsT) T.gemm(V_shared, do, dsT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - T.copy(lse[bz, bx, k * block_N:(k + 1) * block_N], lse_shared) + 
T.copy(lse[bz, bx, k * block_N : (k + 1) * block_N], lse_shared) for i, j in T.Parallel(block_M, block_N): qkT[i, j] = T.exp2(qkT[i, j] * scale - lse_shared[j]) if is_causal: for i, j in T.Parallel(block_M, block_N): - qkT[i, j] = T.if_then_else(by * block_M + i <= k * block_N + j, qkT[i, j], - 0) + qkT[i, j] = T.if_then_else(by * block_M + i <= k * block_N + j, qkT[i, j], 0) T.copy(qkT, qkT_cast) T.gemm(qkT_cast, do, dv, policy=T.GemmWarpPolicy.FullRow) - T.copy(Delta[bz, bx, k * block_N:(k + 1) * block_N], delta) + T.copy(Delta[bz, bx, k * block_N : (k + 1) * block_N], delta) for i, j in T.Parallel(block_M, block_N): dsT_cast[i, j] = qkT[i, j] * (dsT[i, j] - delta[j]) * sm_scale @@ -342,16 +333,15 @@ def flashattn_bwd_split(batch, T.atomic_add(dQ[bz, k * block_N + i, bx, j], dq[i, j]) T.copy(dv, dv_shared) - T.copy(dv_shared, dV[bx % groups, bz, by * block_M:(by + 1) * block_M, bx // groups, :]) + T.copy(dv_shared, dV[bx % groups, bz, by * block_M : (by + 1) * block_M, bx // groups, :]) T.copy(dk, dk_shared) - T.copy(dk, dK[bx % groups, bz, by * block_M:(by + 1) * block_M, bx // groups, :]) + T.copy(dk, dK[bx % groups, bz, by * block_M : (by + 1) * block_M, bx // groups, :]) return flash_bwd @torch.compile class _attention(torch.autograd.Function): - @staticmethod def forward(ctx, q, k, v, causal, groups=1, use_atomic=True): BATCH, N_CTX, H, D_HEAD_QK = q.shape @@ -369,7 +359,10 @@ class _attention(torch.autograd.Function): def backward(ctx, do): q, k, v, o, lse = ctx.saved_tensors BATCH, N_CTX, H, D_HEAD_QK = q.shape - HEAD_KV, D_HEAD_V, = v.shape[-2], v.shape[-1] + ( + HEAD_KV, + D_HEAD_V, + ) = v.shape[-2], v.shape[-1] groups = H // HEAD_KV def maybe_contiguous(x): @@ -386,17 +379,8 @@ class _attention(torch.autograd.Function): if ctx.use_atomic: kernel = flashattn_bwd_atomic_add( - BATCH, - H, - N_CTX, - D_HEAD_QK, - D_HEAD_V, - ctx.causal, - block_M, - block_N, - threads=256, - num_stages=2, - groups=groups) + BATCH, H, N_CTX, D_HEAD_QK, D_HEAD_V, ctx.causal, block_M, block_N, threads=256, num_stages=2, groups=groups + ) shape_q = [BATCH, N_CTX, H, D_HEAD_QK] shape_k = [BATCH, N_CTX, HEAD_KV, D_HEAD_QK] shape_v = [BATCH, N_CTX, HEAD_KV, D_HEAD_V] @@ -409,17 +393,8 @@ class _attention(torch.autograd.Function): dv = dv.to(torch.float16) else: kernel = flashattn_bwd_split( - BATCH, - H, - N_CTX, - D_HEAD_QK, - D_HEAD_V, - ctx.causal, - block_M, - block_N, - threads=256, - num_stages=2, - groups=groups) + BATCH, H, N_CTX, D_HEAD_QK, D_HEAD_V, ctx.causal, block_M, block_N, threads=256, num_stages=2, groups=groups + ) shape_q = [BATCH, N_CTX, H, D_HEAD_QK] shape_k = [groups, BATCH, N_CTX, HEAD_KV, D_HEAD_QK] # sum after kernel shape_v = [groups, BATCH, N_CTX, HEAD_KV, D_HEAD_V] # sum after kernel @@ -441,53 +416,45 @@ def ref_program(Q, K, V, is_causal, groups=1): # K: [B, T, HK, D_QK] # V: [B, T, HV, D_V] # HQ = HKV * groups - assert Q.size(2) == K.size( - 2) * groups, f"Q.size(2): {Q.size(2)}, K.size(2): {K.size(2)}, groups: {groups}" - assert Q.size(2) == V.size( - 2) * groups, f"Q.size(2): {Q.size(2)}, V.size(2): {V.size(2)}, groups: {groups}" + assert Q.size(2) == K.size(2) * groups, f"Q.size(2): {Q.size(2)}, K.size(2): {K.size(2)}, groups: {groups}" + assert Q.size(2) == V.size(2) * groups, f"Q.size(2): {Q.size(2)}, V.size(2): {V.size(2)}, groups: {groups}" dim_qk = Q.size(-1) K = K.repeat_interleave(groups, dim=2) V = V.repeat_interleave(groups, dim=2) - scores = torch.einsum('bqhd,bkhd->bhqk', Q, K) + scores = torch.einsum("bqhd,bkhd->bhqk", Q, K) scores = scores / 
torch.sqrt(torch.tensor(dim_qk, dtype=scores.dtype)) if is_causal: seq_len = Q.size(1) mask = torch.tril(torch.ones(seq_len, seq_len, device=scores.device)) mask = mask.unsqueeze(0).unsqueeze(0) - scores = scores.masked_fill(mask == 0, float('-inf')) + scores = scores.masked_fill(mask == 0, float("-inf")) attention_weights = F.softmax(scores, dim=-1) - output = torch.einsum('bhqk,bkhd->bqhd', attention_weights, V) + output = torch.einsum("bhqk,bkhd->bqhd", attention_weights, V) return output -def main(BATCH: int = 1, - H: int = 32, - N_CTX: int = 256, - D_HEAD_QK: int = 192, - D_HEAD_V: int = 128, - groups: int = 16, - causal: bool = False, - use_atomic: bool = True): +def main( + BATCH: int = 1, + H: int = 32, + N_CTX: int = 256, + D_HEAD_QK: int = 192, + D_HEAD_V: int = 128, + groups: int = 16, + causal: bool = False, + use_atomic: bool = True, +): flops_per_qk = 2.0 * BATCH * H * N_CTX * N_CTX * D_HEAD_QK flops_per_v = 2.0 * BATCH * H * N_CTX * N_CTX * D_HEAD_V total_flops = 3 * flops_per_qk + 2 * flops_per_v if causal: total_flops *= 0.5 - Q = ( - torch.empty(BATCH, N_CTX, H, D_HEAD_QK, dtype=torch.half, - device="cuda").normal_().requires_grad_()) + Q = torch.empty(BATCH, N_CTX, H, D_HEAD_QK, dtype=torch.half, device="cuda").normal_().requires_grad_() head_kv = H // groups - K = ( - torch.empty(BATCH, N_CTX, head_kv, D_HEAD_QK, dtype=torch.half, - device="cuda").normal_().requires_grad_()) - V = ( - torch.empty(BATCH, N_CTX, head_kv, D_HEAD_V, dtype=torch.half, - device="cuda").normal_().requires_grad_()) - dO = ( - torch.empty(BATCH, N_CTX, H, D_HEAD_V, dtype=torch.half, - device="cuda").normal_().requires_grad_()) + K = torch.empty(BATCH, N_CTX, head_kv, D_HEAD_QK, dtype=torch.half, device="cuda").normal_().requires_grad_() + V = torch.empty(BATCH, N_CTX, head_kv, D_HEAD_V, dtype=torch.half, device="cuda").normal_().requires_grad_() + dO = torch.empty(BATCH, N_CTX, H, D_HEAD_V, dtype=torch.half, device="cuda").normal_().requires_grad_() O = attention(Q, K, V, causal, groups, use_atomic) O.backward(dO, retain_graph=True) dQ, Q.grad = Q.grad.clone(), None @@ -504,7 +471,7 @@ def main(BATCH: int = 1, torch.testing.assert_close(dV, dV_ref, rtol=1e-2, atol=1e-2) torch.testing.assert_close(dK, dK_ref, rtol=1e-2, atol=1e-2) torch.testing.assert_close(dQ, dQ_ref, rtol=1e-2, atol=1e-2) - print('All checks passed.✅') + print("All checks passed.✅") def run(): O_ref.backward(dO, retain_graph=True) @@ -524,17 +491,15 @@ def main(BATCH: int = 1, if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=8, help='Batch size') - parser.add_argument('--h', type=int, default=32, help='Number of heads') - parser.add_argument('--n_ctx', type=int, default=1024, help='Context size') - parser.add_argument('--d_head_qk', type=int, default=192, help='Head dimension for Q/K') - parser.add_argument('--d_head_v', type=int, default=128, help='Head dimension for V') - parser.add_argument('--causal', action='store_true', help='Causal flag') - parser.add_argument('--groups', type=int, default=16, help='groups') - parser.add_argument( - '--use_atomic', action='store_true', default=False, help='Use atomic add for dK/dV') - parser.add_argument( - '--use_split', action='store_true', default=False, help='Use split for dK/dV') + parser.add_argument("--batch", type=int, default=8, help="Batch size") + parser.add_argument("--h", type=int, default=32, help="Number of heads") + parser.add_argument("--n_ctx", type=int, default=1024, help="Context size") + 
parser.add_argument("--d_head_qk", type=int, default=192, help="Head dimension for Q/K") + parser.add_argument("--d_head_v", type=int, default=128, help="Head dimension for V") + parser.add_argument("--causal", action="store_true", help="Causal flag") + parser.add_argument("--groups", type=int, default=16, help="groups") + parser.add_argument("--use_atomic", action="store_true", default=False, help="Use atomic add for dK/dV") + parser.add_argument("--use_split", action="store_true", default=False, help="Use split for dK/dV") args = parser.parse_args() # Handle backward compatibility and logic @@ -546,5 +511,4 @@ if __name__ == "__main__": # Default: use atomic use_atomic = True - main(args.batch, args.h, args.n_ctx, args.d_head_qk, args.d_head_v, args.groups, args.causal, - use_atomic) + main(args.batch, args.h, args.n_ctx, args.d_head_qk, args.d_head_v, args.groups, args.causal, use_atomic) diff --git a/examples/flash_attention/example_gqa_bwd_tma_reduce.py b/examples/flash_attention/example_gqa_bwd_tma_reduce.py index 615c2e191d7573b93b93810ba4890cb2a6a6db09..07586f99fdd7d55d52a59359337c423dcca96a6f 100644 --- a/examples/flash_attention/example_gqa_bwd_tma_reduce.py +++ b/examples/flash_attention/example_gqa_bwd_tma_reduce.py @@ -9,25 +9,27 @@ tilelang.disable_cache() @tilelang.jit( - out_idx=[3, 4], pass_configs={ + out_idx=[3, 4], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def flashattn_fwd(batch, heads, seq_len, dim_qk, dim_v, is_causal, block_M, block_N, groups=1): - scale = (1.0 / dim_qk)**0.5 * 1.44269504 # log2(e) + scale = (1.0 / dim_qk) ** 0.5 * 1.44269504 # log2(e) head_kv = heads // groups q_shape = [batch, seq_len, heads, dim_qk] k_shape = [batch, seq_len, head_kv, dim_qk] v_shape = [batch, seq_len, head_kv, dim_v] - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 @T.prim_func def flash_fwd( - Q: T.Tensor(q_shape, dtype), # type: ignore - K: T.Tensor(k_shape, dtype), # type: ignore - V: T.Tensor(v_shape, dtype), # type: ignore - Output: T.Tensor([batch, seq_len, heads, dim_v], dtype), # type: ignore - lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + Q: T.Tensor(q_shape, dtype), # type: ignore + K: T.Tensor(k_shape, dtype), # type: ignore + V: T.Tensor(v_shape, dtype), # type: ignore + Output: T.Tensor([batch, seq_len, heads, dim_v], dtype), # type: ignore + lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore ): with T.Kernel(T.ceildiv(seq_len, block_M), heads, batch, threads=256) as (bx, by, bz): Q_shared = T.alloc_shared([block_M, dim_qk], dtype) @@ -43,27 +45,27 @@ def flashattn_fwd(batch, heads, seq_len, dim_qk, dim_v, is_causal, block_M, bloc logsum = T.alloc_fragment([block_M], accum_dtype) T.annotate_layout({Q_shared: tilelang.layout.make_swizzled_layout(Q_shared)}) - T.copy(Q[bz, bx * block_M:(bx + 1) * block_M, by, :], Q_shared) + T.copy(Q[bz, bx * block_M : (bx + 1) * block_M, by, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) # Warning: in causal/varlen/unaligned seqlen scenarios, the -inf will cause undefined behavior in exp ops # We should set it to negative large number instead T.fill(scores_max, T.Cast(accum_dtype, -1e30)) - loop_range = ( - T.ceildiv( - (bx + 1) * block_M, block_N) if is_causal else T.ceildiv(seq_len, block_N)) + loop_range = T.ceildiv((bx + 1) * block_M, block_N) if is_causal else T.ceildiv(seq_len, block_N) for k in T.Pipelined(loop_range, num_stages=1): - T.copy(K[bz, k * block_N:(k + 1) * block_N, by // groups, :], K_shared) + 
T.copy(K[bz, k * block_N : (k + 1) * block_N, by // groups, :], K_shared) if is_causal: for i, j in T.Parallel(block_M, block_N): - acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0, - T.Cast(accum_dtype, -1e30)) + acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0, T.Cast(accum_dtype, -1e30)) else: - T.clear(acc_s) + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.if_then_else(k * block_N + j >= seq_len, -T.infinity(acc_s.dtype), 0) T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - T.copy(V[bz, k * block_N:(k + 1) * block_N, by // groups, :], V_shared) + T.copy(V[bz, k * block_N : (k + 1) * block_N, by // groups, :], V_shared) T.copy(scores_max, scores_max_prev) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) for i in T.Parallel(block_M): scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_M, dim_v): @@ -77,29 +79,31 @@ def flashattn_fwd(batch, heads, seq_len, dim_qk, dim_v, is_causal, block_M, bloc logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] for i, j in T.Parallel(block_M, dim_v): acc_o[i, j] /= logsum[i] - T.copy(acc_o, Output[bz, bx * block_M:(bx + 1) * block_M, by, :]) + T.copy(acc_o, Output[bz, bx * block_M : (bx + 1) * block_M, by, :]) for i in T.Parallel(block_M): logsum[i] = T.log2(logsum[i]) + scores_max[i] * scale - T.copy(logsum, lse[bz, by, bx * block_M:(bx + 1) * block_M]) + T.copy(logsum, lse[bz, by, bx * block_M : (bx + 1) * block_M]) return flash_fwd @tilelang.jit( - out_idx=[2], pass_configs={ + out_idx=[2], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def flashattn_bwd_preprocess(batch, heads, seq_len, dim_v): - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 shape = [batch, seq_len, heads, dim_v] blk = 32 @T.prim_func def flash_bwd_prep( - O: T.Tensor(shape, dtype), # type: ignore - dO: T.Tensor(shape, dtype), # type: ignore - Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + O: T.Tensor(shape, dtype), # type: ignore + dO: T.Tensor(shape, dtype), # type: ignore + Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore ): with T.Kernel(heads, T.ceildiv(seq_len, blk), batch) as (bx, by, bz): o = T.alloc_fragment([blk, blk], dtype) @@ -108,12 +112,12 @@ def flashattn_bwd_preprocess(batch, heads, seq_len, dim_v): delta = T.alloc_fragment([blk], accum_dtype) T.clear(acc) for k in range(T.ceildiv(dim_v, blk)): - T.copy(O[bz, by * blk:(by + 1) * blk, bx, k * blk:(k + 1) * blk], o) - T.copy(dO[bz, by * blk:(by + 1) * blk, bx, k * blk:(k + 1) * blk], do) + T.copy(O[bz, by * blk : (by + 1) * blk, bx, k * blk : (k + 1) * blk], o) + T.copy(dO[bz, by * blk : (by + 1) * blk, bx, k * blk : (k + 1) * blk], do) for i, j in T.Parallel(blk, blk): acc[i, j] += o[i, j] * do[i, j] T.reduce_sum(acc, delta, 1) - T.copy(delta, Delta[bz, bx, by * blk:(by + 1) * blk]) + T.copy(delta, Delta[bz, bx, by * blk : (by + 1) * blk]) return flash_bwd_prep @@ -124,12 +128,14 @@ def make_dq_layout(dQ): @tilelang.jit( - out_idx=[3, 4, 5], pass_configs={ + out_idx=[3, 4, 5], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def flashattn_bwd_postprocess(batch, heads, head_kv, seq_len, dim_qk, dim_v): - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 q_shape = [batch, seq_len, heads, dim_qk] 
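# make_dq_layout (defined above) reorders dQ/dK/dV so each 8x8 (seq, dim) tile matches the
# 8x8 gemm fragment; per the comment in make_dq_layout, atomicAdd cannot be vectorized, so
# the backward kernels accumulate into this permuted fp32 layout and this postprocess kernel
# copies the result back out in the natural [batch, seq, head, dim] order as fp16.
# A small standalone check (illustration only, not part of the patch; the helper name is
# ours) that the remap is a bijection within one 8x8 tile, i.e. every (seq, dim) element
# gets a distinct slot and the postprocess copy can recover the tensor exactly:
def _dq_tile_slot(l, d):
    # last two coordinates of make_dq_layout's index expression
    return (d % 2, 4 * (l % 8) + (d % 8) // 2)

assert len({_dq_tile_slot(l, d) for l in range(8) for d in range(8)}) == 64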
k_shape = [batch, seq_len, head_kv, dim_qk] v_shape = [batch, seq_len, head_kv, dim_v] @@ -137,64 +143,55 @@ def flashattn_bwd_postprocess(batch, heads, head_kv, seq_len, dim_qk, dim_v): @T.prim_func def flash_bwd_post( - dQ: T.Tensor(q_shape, accum_dtype), # type: ignore - dK: T.Tensor(k_shape, accum_dtype), # type: ignore - dV: T.Tensor(v_shape, accum_dtype), # type: ignore - dQ_out: T.Tensor(q_shape, dtype), # type: ignore - dK_out: T.Tensor(k_shape, dtype), # type: ignore - dV_out: T.Tensor(v_shape, dtype), # type: ignore + dQ: T.Tensor(q_shape, accum_dtype), # type: ignore + dK: T.Tensor(k_shape, accum_dtype), # type: ignore + dV: T.Tensor(v_shape, accum_dtype), # type: ignore + dQ_out: T.Tensor(q_shape, dtype), # type: ignore + dK_out: T.Tensor(k_shape, dtype), # type: ignore + dV_out: T.Tensor(v_shape, dtype), # type: ignore ): with T.Kernel(T.ceildiv(seq_len, blk), heads, batch, threads=128) as (bx, by, bz): T.annotate_layout({dQ: make_dq_layout(dQ)}) - T.copy(dQ[bz, bx * blk:(bx + 1) * blk, by, :], dQ_out[bz, bx * blk:(bx + 1) * blk, - by, :]) + T.copy(dQ[bz, bx * blk : (bx + 1) * blk, by, :], dQ_out[bz, bx * blk : (bx + 1) * blk, by, :]) with T.Kernel(T.ceildiv(seq_len, blk), head_kv, batch, threads=128) as (bx, by, bz): - T.annotate_layout({ - dK: make_dq_layout(dK), - dV: make_dq_layout(dV), - }) - T.copy(dK[bz, bx * blk:(bx + 1) * blk, by, :], dK_out[bz, bx * blk:(bx + 1) * blk, - by, :]) - T.copy(dV[bz, bx * blk:(bx + 1) * blk, by, :], dV_out[bz, bx * blk:(bx + 1) * blk, - by, :]) + T.annotate_layout( + { + dK: make_dq_layout(dK), + dV: make_dq_layout(dV), + } + ) + T.copy(dK[bz, bx * blk : (bx + 1) * blk, by, :], dK_out[bz, bx * blk : (bx + 1) * blk, by, :]) + T.copy(dV[bz, bx * blk : (bx + 1) * blk, by, :], dV_out[bz, bx * blk : (bx + 1) * blk, by, :]) return flash_bwd_post -@tilelang.jit(pass_configs={ - tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, -}) -def flashattn_bwd_atomic_add(batch, - heads, - seq_len, - dim_qk, - dim_v, - is_causal, - block_M, - block_N, - threads=256, - num_stages=2, - groups=1): - sm_scale = (1.0 / dim_qk)**0.5 - scale = (1.0 / dim_qk)**0.5 * 1.44269504 # log2(e) +@tilelang.jit( + pass_configs={ + tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, + } +) +def flashattn_bwd_atomic_add(batch, heads, seq_len, dim_qk, dim_v, is_causal, block_M, block_N, threads=256, num_stages=2, groups=1): + sm_scale = (1.0 / dim_qk) ** 0.5 + scale = (1.0 / dim_qk) ** 0.5 * 1.44269504 # log2(e) head_kv = heads // groups q_shape = [batch, seq_len, heads, dim_qk] k_shape = [batch, seq_len, head_kv, dim_qk] v_shape = [batch, seq_len, head_kv, dim_v] - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 @T.prim_func def flash_bwd( - Q: T.Tensor(q_shape, dtype), # type: ignore - K: T.Tensor(k_shape, dtype), # type: ignore - V: T.Tensor(v_shape, dtype), # type: ignore - dO: T.Tensor([batch, seq_len, heads, dim_v], dtype), # type: ignore - lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore - Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore - dQ: T.Tensor(q_shape, accum_dtype), # type: ignore - dK: T.Tensor(k_shape, accum_dtype), # type: ignore - dV: T.Tensor(v_shape, accum_dtype), # type: ignore + Q: T.Tensor(q_shape, dtype), # type: ignore + K: T.Tensor(k_shape, dtype), # type: ignore + V: T.Tensor(v_shape, dtype), # type: ignore + dO: T.Tensor([batch, seq_len, heads, dim_v], dtype), # type: ignore + lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + Delta: T.Tensor([batch, 
heads, seq_len], accum_dtype), # type: ignore + dQ: T.Tensor(q_shape, accum_dtype), # type: ignore + dK: T.Tensor(k_shape, accum_dtype), # type: ignore + dV: T.Tensor(v_shape, accum_dtype), # type: ignore ): with T.Kernel(heads, T.ceildiv(seq_len, block_M), batch, threads=threads) as (bx, by, bz): K_shared = T.alloc_shared([block_M, dim_qk], dtype) @@ -215,37 +212,38 @@ def flashattn_bwd_atomic_add(batch, dv_shared = T.alloc_shared([block_M, dim_v], accum_dtype) dq_shared = T.alloc_shared([block_N, dim_qk], accum_dtype) - T.annotate_layout({ - dQ: make_dq_layout(dQ), - dK: make_dq_layout(dK), - dV: make_dq_layout(dV), - K_shared: tilelang.layout.make_swizzled_layout(K_shared), - }) - - T.copy(K[bz, by * block_M:(by + 1) * block_M, bx // groups, :], K_shared) - T.copy(V[bz, by * block_M:(by + 1) * block_M, bx // groups, :], V_shared) + T.annotate_layout( + { + dQ: make_dq_layout(dQ), + dK: make_dq_layout(dK), + dV: make_dq_layout(dV), + K_shared: tilelang.layout.make_swizzled_layout(K_shared), + } + ) + + T.copy(K[bz, by * block_M : (by + 1) * block_M, bx // groups, :], K_shared) + T.copy(V[bz, by * block_M : (by + 1) * block_M, bx // groups, :], V_shared) T.clear(dv) T.clear(dk) loop_st = T.floordiv(by * block_M, block_N) if is_causal else 0 loop_ed = T.ceildiv(seq_len, block_N) for k in T.Pipelined(loop_st, loop_ed, num_stages=num_stages): - T.copy(Q[bz, k * block_N:(k + 1) * block_N, bx, :], q) + T.copy(Q[bz, k * block_N : (k + 1) * block_N, bx, :], q) T.clear(qkT) T.gemm(K_shared, q, qkT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - T.copy(lse[bz, bx, k * block_N:(k + 1) * block_N], lse_shared) + T.copy(lse[bz, bx, k * block_N : (k + 1) * block_N], lse_shared) for i, j in T.Parallel(block_M, block_N): qkT[i, j] = T.exp2(qkT[i, j] * scale - lse_shared[j]) if is_causal: for i, j in T.Parallel(block_M, block_N): - qkT[i, j] = T.if_then_else(by * block_M + i <= k * block_N + j, qkT[i, j], - 0) - T.copy(dO[bz, k * block_N:(k + 1) * block_N, bx, :], do) + qkT[i, j] = T.if_then_else(by * block_M + i <= k * block_N + j, qkT[i, j], 0) + T.copy(dO[bz, k * block_N : (k + 1) * block_N, bx, :], do) T.clear(dsT) T.gemm(V_shared, do, dsT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) T.copy(qkT, qkT_cast) T.gemm(qkT_cast, do, dv, policy=T.GemmWarpPolicy.FullRow) - T.copy(Delta[bz, bx, k * block_N:(k + 1) * block_N], delta) + T.copy(Delta[bz, bx, k * block_N : (k + 1) * block_N], delta) for i, j in T.Parallel(block_M, block_N): dsT_cast[i, j] = qkT[i, j] * (dsT[i, j] - delta[j]) * sm_scale @@ -255,53 +253,43 @@ def flashattn_bwd_atomic_add(batch, T.clear(dq) T.gemm(dsT_shared, K_shared, dq, transpose_A=True) T.copy(dq, dq_shared) - T.atomic_add(dQ[bz, k * block_N:(k + 1) * block_N, bx, :], dq_shared, use_tma=True) + T.atomic_add(dQ[bz, k * block_N : (k + 1) * block_N, bx, :], dq_shared, use_tma=True) T.copy(dv, dv_shared) - T.atomic_add( - dV[bz, by * block_M:(by + 1) * block_M, bx // groups, :], dv_shared, use_tma=True) + T.atomic_add(dV[bz, by * block_M : (by + 1) * block_M, bx // groups, :], dv_shared, use_tma=True) T.copy(dk, dk_shared) - T.atomic_add( - dK[bz, by * block_M:(by + 1) * block_M, bx // groups, :], dk_shared, use_tma=True) + T.atomic_add(dK[bz, by * block_M : (by + 1) * block_M, bx // groups, :], dk_shared, use_tma=True) return flash_bwd -@tilelang.jit(pass_configs={ - tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, -}) -def flashattn_bwd_split_novarlen(batch, - heads, - seq_len, - dim_qk, - dim_v, - is_causal, - block_M, - block_N, - threads=256, - num_stages=2, - 
groups=1): - sm_scale = (1.0 / dim_qk)**0.5 - scale = (1.0 / dim_qk)**0.5 * 1.44269504 # log2(e) +@tilelang.jit( + pass_configs={ + tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, + } +) +def flashattn_bwd_split_novarlen(batch, heads, seq_len, dim_qk, dim_v, is_causal, block_M, block_N, threads=256, num_stages=2, groups=1): + sm_scale = (1.0 / dim_qk) ** 0.5 + scale = (1.0 / dim_qk) ** 0.5 * 1.44269504 # log2(e) head_kv = heads // groups q_shape = [batch, seq_len, heads, dim_qk] k_shape = [batch, seq_len, head_kv, dim_qk] v_shape = [batch, seq_len, head_kv, dim_v] dk_shape = [groups, batch, seq_len, head_kv, dim_qk] # sum after kernel dv_shape = [groups, batch, seq_len, head_kv, dim_v] # sum after kernel - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 @T.prim_func def flash_bwd( - Q: T.Tensor(q_shape, dtype), # type: ignore - K: T.Tensor(k_shape, dtype), # type: ignore - V: T.Tensor(v_shape, dtype), # type: ignore - dO: T.Tensor([batch, seq_len, heads, dim_v], dtype), # type: ignore - lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore - Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore - dQ: T.Tensor(q_shape, accum_dtype), # type: ignore - dK: T.Tensor(dk_shape, dtype), # type: ignore - dV: T.Tensor(dv_shape, dtype), # type: ignore + Q: T.Tensor(q_shape, dtype), # type: ignore + K: T.Tensor(k_shape, dtype), # type: ignore + V: T.Tensor(v_shape, dtype), # type: ignore + dO: T.Tensor([batch, seq_len, heads, dim_v], dtype), # type: ignore + lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + dQ: T.Tensor(q_shape, accum_dtype), # type: ignore + dK: T.Tensor(dk_shape, dtype), # type: ignore + dV: T.Tensor(dv_shape, dtype), # type: ignore ): with T.Kernel(heads, T.ceildiv(seq_len, block_M), batch, threads=threads) as (bx, by, bz): K_shared = T.alloc_shared([block_M, dim_qk], dtype) @@ -321,37 +309,38 @@ def flashattn_bwd_split_novarlen(batch, dv_shared = T.alloc_shared([block_M, dim_v], dtype) dk_shared = T.alloc_shared([block_M, dim_qk], dtype) - T.annotate_layout({ - dQ: make_dq_layout(dQ), - K_shared: tilelang.layout.make_swizzled_layout(K_shared), - dv_shared: tilelang.layout.make_swizzled_layout(dv_shared), - dk_shared: tilelang.layout.make_swizzled_layout(dk_shared), - }) - - T.copy(K[bz, by * block_M:(by + 1) * block_M, bx // groups, :], K_shared) - T.copy(V[bz, by * block_M:(by + 1) * block_M, bx // groups, :], V_shared) + T.annotate_layout( + { + dQ: make_dq_layout(dQ), + K_shared: tilelang.layout.make_swizzled_layout(K_shared), + dv_shared: tilelang.layout.make_swizzled_layout(dv_shared), + dk_shared: tilelang.layout.make_swizzled_layout(dk_shared), + } + ) + + T.copy(K[bz, by * block_M : (by + 1) * block_M, bx // groups, :], K_shared) + T.copy(V[bz, by * block_M : (by + 1) * block_M, bx // groups, :], V_shared) T.clear(dv) T.clear(dk) loop_st = T.floordiv(by * block_M, block_N) if is_causal else 0 loop_ed = T.ceildiv(seq_len, block_N) for k in T.Pipelined(loop_st, loop_ed, num_stages=num_stages): - T.copy(Q[bz, k * block_N:(k + 1) * block_N, bx, :], q) + T.copy(Q[bz, k * block_N : (k + 1) * block_N, bx, :], q) T.clear(qkT) T.gemm(K_shared, q, qkT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - T.copy(dO[bz, k * block_N:(k + 1) * block_N, bx, :], do) + T.copy(dO[bz, k * block_N : (k + 1) * block_N, bx, :], do) T.clear(dsT) T.gemm(V_shared, do, dsT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - 
T.copy(lse[bz, bx, k * block_N:(k + 1) * block_N], lse_shared) + T.copy(lse[bz, bx, k * block_N : (k + 1) * block_N], lse_shared) for i, j in T.Parallel(block_M, block_N): qkT[i, j] = T.exp2(qkT[i, j] * scale - lse_shared[j]) if is_causal: for i, j in T.Parallel(block_M, block_N): - qkT[i, j] = T.if_then_else(by * block_M + i <= k * block_N + j, qkT[i, j], - 0) + qkT[i, j] = T.if_then_else(by * block_M + i <= k * block_N + j, qkT[i, j], 0) T.copy(qkT, qkT_cast) T.gemm(qkT_cast, do, dv, policy=T.GemmWarpPolicy.FullRow) - T.copy(Delta[bz, bx, k * block_N:(k + 1) * block_N], delta) + T.copy(Delta[bz, bx, k * block_N : (k + 1) * block_N], delta) for i, j in T.Parallel(block_M, block_N): dsT_cast[i, j] = qkT[i, j] * (dsT[i, j] - delta[j]) * sm_scale @@ -364,16 +353,15 @@ def flashattn_bwd_split_novarlen(batch, T.atomic_add(dQ[bz, k * block_N + i, bx, j], dq[i, j]) T.copy(dv, dv_shared) - T.copy(dv_shared, dV[bx % groups, bz, by * block_M:(by + 1) * block_M, bx // groups, :]) + T.copy(dv_shared, dV[bx % groups, bz, by * block_M : (by + 1) * block_M, bx // groups, :]) T.copy(dk, dk_shared) - T.copy(dk, dK[bx % groups, bz, by * block_M:(by + 1) * block_M, bx // groups, :]) + T.copy(dk, dK[bx % groups, bz, by * block_M : (by + 1) * block_M, bx // groups, :]) return flash_bwd @torch.compile class _attention(torch.autograd.Function): - @staticmethod def forward(ctx, q, k, v, causal, groups=1, use_atomic=True): BATCH, N_CTX, H, D_HEAD_QK = q.shape @@ -391,7 +379,10 @@ class _attention(torch.autograd.Function): def backward(ctx, do): q, k, v, o, lse = ctx.saved_tensors BATCH, N_CTX, H, D_HEAD_QK = q.shape - HEAD_KV, D_HEAD_V, = v.shape[-2], v.shape[-1] + ( + HEAD_KV, + D_HEAD_V, + ) = v.shape[-2], v.shape[-1] groups = H // HEAD_KV def maybe_contiguous(x): @@ -408,17 +399,8 @@ class _attention(torch.autograd.Function): if ctx.use_atomic: kernel = flashattn_bwd_atomic_add( - BATCH, - H, - N_CTX, - D_HEAD_QK, - D_HEAD_V, - ctx.causal, - block_M, - block_N, - threads=256, - num_stages=2, - groups=groups) + BATCH, H, N_CTX, D_HEAD_QK, D_HEAD_V, ctx.causal, block_M, block_N, threads=256, num_stages=2, groups=groups + ) shape_q = [BATCH, N_CTX, H, D_HEAD_QK] shape_k = [BATCH, N_CTX, HEAD_KV, D_HEAD_QK] shape_v = [BATCH, N_CTX, HEAD_KV, D_HEAD_V] @@ -429,17 +411,8 @@ class _attention(torch.autograd.Function): dq, dk, dv = mod_post(dq, dk, dv) else: kernel = flashattn_bwd_split_novarlen( - BATCH, - H, - N_CTX, - D_HEAD_QK, - D_HEAD_V, - ctx.causal, - block_M, - block_N, - threads=256, - num_stages=2, - groups=groups) + BATCH, H, N_CTX, D_HEAD_QK, D_HEAD_V, ctx.causal, block_M, block_N, threads=256, num_stages=2, groups=groups + ) shape_q = [BATCH, N_CTX, H, D_HEAD_QK] shape_k = [groups, BATCH, N_CTX, HEAD_KV, D_HEAD_QK] # sum after kernel shape_v = [groups, BATCH, N_CTX, HEAD_KV, D_HEAD_V] # sum after kernel @@ -447,8 +420,7 @@ class _attention(torch.autograd.Function): dk = torch.empty(shape_k, dtype=torch.float16, device=q.device) dv = torch.empty(shape_v, dtype=torch.float16, device=q.device) kernel(q, k, v, do, lse, delta, dq, dk, dv) - dq, _, _ = mod_post(dq, torch.zeros_like(k, dtype=torch.float32), - torch.zeros_like(v, dtype=torch.float32)) + dq, _, _ = mod_post(dq, torch.zeros_like(k, dtype=torch.float32), torch.zeros_like(v, dtype=torch.float32)) dk, dv = dk.sum(0), dv.sum(0) return dq, dk, dv, None, None, None @@ -462,53 +434,45 @@ def ref_program(Q, K, V, is_causal, groups=1): # K: [B, T, HK, D_QK] # V: [B, T, HV, D_V] # HQ = HKV * groups - assert Q.size(2) == K.size( - 2) * groups, f"Q.size(2): 
{Q.size(2)}, K.size(2): {K.size(2)}, groups: {groups}" - assert Q.size(2) == V.size( - 2) * groups, f"Q.size(2): {Q.size(2)}, V.size(2): {V.size(2)}, groups: {groups}" + assert Q.size(2) == K.size(2) * groups, f"Q.size(2): {Q.size(2)}, K.size(2): {K.size(2)}, groups: {groups}" + assert Q.size(2) == V.size(2) * groups, f"Q.size(2): {Q.size(2)}, V.size(2): {V.size(2)}, groups: {groups}" dim_qk = Q.size(-1) K = K.repeat_interleave(groups, dim=2) V = V.repeat_interleave(groups, dim=2) - scores = torch.einsum('bqhd,bkhd->bhqk', Q, K) + scores = torch.einsum("bqhd,bkhd->bhqk", Q, K) scores = scores / torch.sqrt(torch.tensor(dim_qk, dtype=scores.dtype)) if is_causal: seq_len = Q.size(1) mask = torch.tril(torch.ones(seq_len, seq_len, device=scores.device)) mask = mask.unsqueeze(0).unsqueeze(0) - scores = scores.masked_fill(mask == 0, float('-inf')) + scores = scores.masked_fill(mask == 0, float("-inf")) attention_weights = F.softmax(scores, dim=-1) - output = torch.einsum('bhqk,bkhd->bqhd', attention_weights, V) + output = torch.einsum("bhqk,bkhd->bqhd", attention_weights, V) return output -def main(BATCH: int = 1, - H: int = 32, - N_CTX: int = 256, - D_HEAD_QK: int = 192, - D_HEAD_V: int = 128, - groups: int = 16, - causal: bool = False, - use_atomic: bool = True): +def main( + BATCH: int = 1, + H: int = 32, + N_CTX: int = 256, + D_HEAD_QK: int = 192, + D_HEAD_V: int = 128, + groups: int = 16, + causal: bool = False, + use_atomic: bool = True, +): flops_per_qk = 2.0 * BATCH * H * N_CTX * N_CTX * D_HEAD_QK flops_per_v = 2.0 * BATCH * H * N_CTX * N_CTX * D_HEAD_V total_flops = 3 * flops_per_qk + 2 * flops_per_v if causal: total_flops *= 0.5 - Q = ( - torch.empty(BATCH, N_CTX, H, D_HEAD_QK, dtype=torch.half, - device="cuda").normal_().requires_grad_()) + Q = torch.empty(BATCH, N_CTX, H, D_HEAD_QK, dtype=torch.half, device="cuda").normal_().requires_grad_() head_kv = H // groups - K = ( - torch.empty(BATCH, N_CTX, head_kv, D_HEAD_QK, dtype=torch.half, - device="cuda").normal_().requires_grad_()) - V = ( - torch.empty(BATCH, N_CTX, head_kv, D_HEAD_V, dtype=torch.half, - device="cuda").normal_().requires_grad_()) - dO = ( - torch.empty(BATCH, N_CTX, H, D_HEAD_V, dtype=torch.half, - device="cuda").normal_().requires_grad_()) + K = torch.empty(BATCH, N_CTX, head_kv, D_HEAD_QK, dtype=torch.half, device="cuda").normal_().requires_grad_() + V = torch.empty(BATCH, N_CTX, head_kv, D_HEAD_V, dtype=torch.half, device="cuda").normal_().requires_grad_() + dO = torch.empty(BATCH, N_CTX, H, D_HEAD_V, dtype=torch.half, device="cuda").normal_().requires_grad_() O = attention(Q, K, V, causal, groups, use_atomic) O.backward(dO, retain_graph=True) dQ, Q.grad = Q.grad.clone(), None @@ -525,7 +489,7 @@ def main(BATCH: int = 1, torch.testing.assert_close(dV, dV_ref, rtol=1e-2, atol=1e-2) torch.testing.assert_close(dK, dK_ref, rtol=1e-2, atol=1e-2) torch.testing.assert_close(dQ, dQ_ref, rtol=1e-2, atol=1e-2) - print('All checks passed.✅') + print("All checks passed.✅") def run(): O_ref.backward(dO, retain_graph=True) @@ -548,17 +512,15 @@ if __name__ == "__main__": print(f"Detected GPU compute capability: {arch}") assert float(arch) >= 9.0, "This example only supports GPU with compute capability >= 9.0" parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=8, help='Batch size') - parser.add_argument('--h', type=int, default=32, help='Number of heads') - parser.add_argument('--n_ctx', type=int, default=1024, help='Context size') - parser.add_argument('--d_head_qk', type=int, default=192, 
help='Head dimension for Q/K') - parser.add_argument('--d_head_v', type=int, default=128, help='Head dimension for V') - parser.add_argument('--causal', action='store_true', help='Causal flag') - parser.add_argument('--groups', type=int, default=16, help='groups') - parser.add_argument( - '--use_atomic', action='store_true', default=False, help='Use atomic add for dK/dV') - parser.add_argument( - '--use_split', action='store_true', default=False, help='Use split for dK/dV') + parser.add_argument("--batch", type=int, default=8, help="Batch size") + parser.add_argument("--h", type=int, default=32, help="Number of heads") + parser.add_argument("--n_ctx", type=int, default=1024, help="Context size") + parser.add_argument("--d_head_qk", type=int, default=192, help="Head dimension for Q/K") + parser.add_argument("--d_head_v", type=int, default=128, help="Head dimension for V") + parser.add_argument("--causal", action="store_true", help="Causal flag") + parser.add_argument("--groups", type=int, default=16, help="groups") + parser.add_argument("--use_atomic", action="store_true", default=False, help="Use atomic add for dK/dV") + parser.add_argument("--use_split", action="store_true", default=False, help="Use split for dK/dV") args = parser.parse_args() # Handle backward compatibility and logic @@ -570,5 +532,4 @@ if __name__ == "__main__": # Default: use atomic use_atomic = True - main(args.batch, args.h, args.n_ctx, args.d_head_qk, args.d_head_v, args.groups, args.causal, - use_atomic) + main(args.batch, args.h, args.n_ctx, args.d_head_qk, args.d_head_v, args.groups, args.causal, use_atomic) diff --git a/examples/flash_attention/example_gqa_bwd_tma_reduce_varlen.py b/examples/flash_attention/example_gqa_bwd_tma_reduce_varlen.py index 88f2d81e13faa26fc924b125efb048d8df54880b..cc88b64da7a44ffc9b95f09bb8a1cd45eb681136 100644 --- a/examples/flash_attention/example_gqa_bwd_tma_reduce_varlen.py +++ b/examples/flash_attention/example_gqa_bwd_tma_reduce_varlen.py @@ -7,57 +7,44 @@ import argparse from einops import rearrange, repeat from bert_padding import pad_input, unpad_input -# tilelang.disable_cache() - def generate_random_padding_mask(max_seqlen, batch_size, device, mode="random"): assert mode in ["full", "random", "third"] if mode == "full": lengths = torch.full((batch_size, 1), max_seqlen, device=device, dtype=torch.int32) elif mode == "random": - lengths = torch.randint( - max(1, max_seqlen - 20), max_seqlen + 1, (batch_size, 1), device=device) + lengths = torch.randint(max(1, max_seqlen - 20), max_seqlen + 1, (batch_size, 1), device=device) elif mode == "third": lengths = torch.randint(max_seqlen // 3, max_seqlen + 1, (batch_size, 1), device=device) - padding_mask = ( - repeat(torch.arange(max_seqlen, device=device), "s -> b s", b=batch_size) < lengths) + padding_mask = repeat(torch.arange(max_seqlen, device=device), "s -> b s", b=batch_size) < lengths return padding_mask @tilelang.jit( - out_idx=[5, 6], pass_configs={ + out_idx=[5, 6], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) -def flashattn_fwd(batch, - total_q, - total_kv, - N_CTX, - heads, - max_seq_len, - dim_qk, - dim_v, - is_causal, - block_M, - block_N, - groups=1): - scale = (1.0 / dim_qk)**0.5 * 1.44269504 # log2(e) + }, +) +def flashattn_fwd(batch, total_q, total_kv, N_CTX, heads, max_seq_len, dim_qk, dim_v, is_causal, block_M, block_N, groups=1): + scale = (1.0 / dim_qk) ** 0.5 * 1.44269504 # log2(e) head_kv = heads // groups q_shape = [total_q, heads, dim_qk] k_shape = [total_kv, head_kv, dim_qk] 
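# The kernels in this file evaluate softmax in base 2: T.exp2 / T.log2 map to fast hardware
# instructions, so the usual 1/sqrt(dim_qk) scaling and the change of base are folded into
# the single constant `scale = (1.0 / dim_qk) ** 0.5 * 1.44269504  # log2(e)`, and lse is a
# base-2 log-sum-exp. A minimal standalone NumPy sketch of the identity this folding relies
# on (illustration only, not part of the patch; dim_qk = 192 is just the example's default
# Q/K head dimension, and the underscored names are ours):
import numpy as np

_dim_qk = 192
_scale = (1.0 / _dim_qk) ** 0.5 * 1.44269504  # log2(e) / sqrt(dim_qk)
_s = np.random.randn(8).astype(np.float32)  # raw Q.K^T scores for one query row
# exp(s / sqrt(d)) == 2 ** (s * log2(e) / sqrt(d)), so both evaluations agree:
assert np.allclose(np.exp(_s / np.sqrt(_dim_qk)), np.exp2(_s * _scale), rtol=1e-5)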
v_shape = [total_kv, head_kv, dim_v] o_shape = [total_q, heads, dim_v] - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 @T.prim_func def flash_fwd( - Q: T.Tensor(q_shape, dtype), # type: ignore - K: T.Tensor(k_shape, dtype), # type: ignore - V: T.Tensor(v_shape, dtype), # type: ignore - cu_seqlens_q: T.Tensor([batch + 1], "int32"), # type: ignore - cu_seqlens_k: T.Tensor([batch + 1], "int32"), # type: ignore - Output: T.Tensor(o_shape, dtype), # type: ignore - lse: T.Tensor([batch, heads, N_CTX], accum_dtype), # type: ignore + Q: T.Tensor(q_shape, dtype), # type: ignore + K: T.Tensor(k_shape, dtype), # type: ignore + V: T.Tensor(v_shape, dtype), # type: ignore + cu_seqlens_q: T.Tensor([batch + 1], T.int32), # type: ignore + cu_seqlens_k: T.Tensor([batch + 1], T.int32), # type: ignore + Output: T.Tensor(o_shape, dtype), # type: ignore + lse: T.Tensor([batch, heads, N_CTX], accum_dtype), # type: ignore ): with T.Kernel(T.ceildiv(max_seq_len, block_M), heads, batch, threads=256) as (bx, by, bz): Q_shared = T.alloc_shared([block_M, dim_qk], dtype) @@ -102,15 +89,17 @@ def flashattn_fwd(batch, if is_causal: for i, j in T.Parallel(block_M, block_N): - acc_s[i, j] = T.if_then_else((bx * block_M + i >= k * block_N + j) and - (bx * block_M + i < q_current_seqlen and - k * block_N + j < k_current_seqlen), 0, - T.Cast(accum_dtype, -1e30)) + acc_s[i, j] = T.if_then_else( + (bx * block_M + i >= k * block_N + j) + and (bx * block_M + i < q_current_seqlen and k * block_N + j < k_current_seqlen), + 0, + T.Cast(accum_dtype, -1e30), + ) else: for i, j in T.Parallel(block_M, block_N): acc_s[i, j] = T.if_then_else( - bx * block_M + i < q_current_seqlen and - k * block_N + j < k_current_seqlen, 0, T.Cast(accum_dtype, -1e30)) + bx * block_M + i < q_current_seqlen and k * block_N + j < k_current_seqlen, 0, T.Cast(accum_dtype, -1e30) + ) T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) for i, d in T.Parallel(block_N, dim_v): if k * block_N + i < k_current_seqlen: @@ -119,6 +108,8 @@ def flashattn_fwd(batch, V_shared[i, d] = 0.0 T.copy(scores_max, scores_max_prev) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) for i in T.Parallel(block_M): scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_M, dim_v): @@ -146,21 +137,23 @@ def flashattn_fwd(batch, @tilelang.jit( - out_idx=[3], pass_configs={ + out_idx=[3], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def flashattn_bwd_preprocess(batch, heads, total_q, N_CTX, max_seq_len, dim_v): - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 shape = [total_q, heads, dim_v] blk = 32 @T.prim_func def flash_bwd_prep( - O: T.Tensor(shape, dtype), # type: ignore - dO: T.Tensor(shape, dtype), # type: ignore - cu_seqlens_q: T.Tensor([batch + 1], "int32"), # type: ignore - Delta: T.Tensor([batch, heads, N_CTX], accum_dtype), # type: ignore + O: T.Tensor(shape, dtype), # type: ignore + dO: T.Tensor(shape, dtype), # type: ignore + cu_seqlens_q: T.Tensor([batch + 1], T.int32), # type: ignore + Delta: T.Tensor([batch, heads, N_CTX], accum_dtype), # type: ignore ): with T.Kernel(heads, T.ceildiv(max_seq_len, blk), batch) as (bx, by, bz): o = T.alloc_fragment([blk, blk], dtype) @@ -199,12 +192,14 @@ def make_dq_layout(dQ): @tilelang.jit( - out_idx=[3, 4, 5], pass_configs={ + out_idx=[3, 4, 5], + 
pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def flashattn_bwd_postprocess(total_q, total_kv, heads, head_kv, dim_qk, dim_v): - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 q_shape = [total_q, heads, dim_qk] k_shape = [total_kv, head_kv, dim_qk] v_shape = [total_kv, head_kv, dim_v] @@ -212,70 +207,62 @@ def flashattn_bwd_postprocess(total_q, total_kv, heads, head_kv, dim_qk, dim_v): @T.prim_func def flash_bwd_post( - dQ: T.Tensor(q_shape, accum_dtype), # type: ignore - dK: T.Tensor(k_shape, accum_dtype), # type: ignore - dV: T.Tensor(v_shape, accum_dtype), # type: ignore - dQ_out: T.Tensor(q_shape, dtype), # type: ignore - dK_out: T.Tensor(k_shape, dtype), # type: ignore - dV_out: T.Tensor(v_shape, dtype), # type: ignore + dQ: T.Tensor(q_shape, accum_dtype), # type: ignore + dK: T.Tensor(k_shape, accum_dtype), # type: ignore + dV: T.Tensor(v_shape, accum_dtype), # type: ignore + dQ_out: T.Tensor(q_shape, dtype), # type: ignore + dK_out: T.Tensor(k_shape, dtype), # type: ignore + dV_out: T.Tensor(v_shape, dtype), # type: ignore ): with T.Kernel(T.ceildiv(total_q, blk), heads, threads=128) as (bx, by): T.annotate_layout({dQ: make_dq_layout(dQ)}) - T.copy(dQ[bx * blk:(bx + 1) * blk, by, :], dQ_out[bx * blk:(bx + 1) * blk, by, :]) + T.copy(dQ[bx * blk : (bx + 1) * blk, by, :], dQ_out[bx * blk : (bx + 1) * blk, by, :]) with T.Kernel(T.ceildiv(total_kv, blk), head_kv, threads=128) as (bx, by): - T.annotate_layout({ - dK: make_dq_layout(dK), - dV: make_dq_layout(dV), - }) - T.copy(dK[bx * blk:(bx + 1) * blk, by, :], dK_out[bx * blk:(bx + 1) * blk, by, :]) - T.copy(dV[bx * blk:(bx + 1) * blk, by, :], dV_out[bx * blk:(bx + 1) * blk, by, :]) + T.annotate_layout( + { + dK: make_dq_layout(dK), + dV: make_dq_layout(dV), + } + ) + T.copy(dK[bx * blk : (bx + 1) * blk, by, :], dK_out[bx * blk : (bx + 1) * blk, by, :]) + T.copy(dV[bx * blk : (bx + 1) * blk, by, :], dV_out[bx * blk : (bx + 1) * blk, by, :]) return flash_bwd_post -@tilelang.jit(pass_configs={ - tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, -}) -def flashattn_bwd_atomic_add(batch, - total_q, - total_kv, - N_CTX, - heads, - max_seq_len, - dim_qk, - dim_v, - is_causal, - block_M, - block_N, - threads=256, - num_stages=2, - groups=1): - sm_scale = (1.0 / dim_qk)**0.5 - scale = (1.0 / dim_qk)**0.5 * 1.44269504 # log2(e) +@tilelang.jit( + pass_configs={ + tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, + } +) +def flashattn_bwd_atomic_add( + batch, total_q, total_kv, N_CTX, heads, max_seq_len, dim_qk, dim_v, is_causal, block_M, block_N, threads=256, num_stages=2, groups=1 +): + sm_scale = (1.0 / dim_qk) ** 0.5 + scale = (1.0 / dim_qk) ** 0.5 * 1.44269504 # log2(e) head_kv = heads // groups q_shape = [total_q, heads, dim_qk] k_shape = [total_kv, head_kv, dim_qk] v_shape = [total_kv, head_kv, dim_v] do_shape = [total_q, heads, dim_v] - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 @T.prim_func def flash_bwd( - Q: T.Tensor(q_shape, dtype), # type: ignore - K: T.Tensor(k_shape, dtype), # type: ignore - V: T.Tensor(v_shape, dtype), # type: ignore - dO: T.Tensor(do_shape, dtype), # type: ignore - lse: T.Tensor([batch, heads, N_CTX], accum_dtype), # type: ignore - Delta: T.Tensor([batch, heads, N_CTX], accum_dtype), # type: ignore - cu_seqlens_q: T.Tensor([batch + 1], "int32"), # type: ignore - cu_seqlens_k: T.Tensor([batch + 1], "int32"), # type: ignore - dQ: T.Tensor(q_shape, accum_dtype), # type: ignore - dK: T.Tensor(k_shape, 
accum_dtype), # type: ignore - dV: T.Tensor(v_shape, accum_dtype), # type: ignore + Q: T.Tensor(q_shape, dtype), # type: ignore + K: T.Tensor(k_shape, dtype), # type: ignore + V: T.Tensor(v_shape, dtype), # type: ignore + dO: T.Tensor(do_shape, dtype), # type: ignore + lse: T.Tensor([batch, heads, N_CTX], accum_dtype), # type: ignore + Delta: T.Tensor([batch, heads, N_CTX], accum_dtype), # type: ignore + cu_seqlens_q: T.Tensor([batch + 1], T.int32), # type: ignore + cu_seqlens_k: T.Tensor([batch + 1], T.int32), # type: ignore + dQ: T.Tensor(q_shape, accum_dtype), # type: ignore + dK: T.Tensor(k_shape, accum_dtype), # type: ignore + dV: T.Tensor(v_shape, accum_dtype), # type: ignore ): - with T.Kernel( - heads, T.ceildiv(max_seq_len, block_M), batch, threads=threads) as (bx, by, bz): + with T.Kernel(heads, T.ceildiv(max_seq_len, block_M), batch, threads=threads) as (bx, by, bz): K_shared = T.alloc_shared([block_M, dim_qk], dtype) dsT_shared = T.alloc_shared([block_M, block_N], dtype) q = T.alloc_shared([block_N, dim_qk], dtype) @@ -301,58 +288,54 @@ def flashattn_bwd_atomic_add(batch, q_current_seqlen = q_end_idx - q_start_idx k_current_seqlen = k_end_idx - k_start_idx - T.annotate_layout({ - dQ: make_dq_layout(dQ), - dK: make_dq_layout(dK), - dV: make_dq_layout(dV), - K_shared: tilelang.layout.make_swizzled_layout(K_shared), - }) + T.annotate_layout( + { + dQ: make_dq_layout(dQ), + dK: make_dq_layout(dK), + dV: make_dq_layout(dV), + K_shared: tilelang.layout.make_swizzled_layout(K_shared), + } + ) - T.copy(K[k_start_idx + by * block_M:k_start_idx + (by + 1) * block_M, bx // groups, :], - K_shared) - T.copy(V[k_start_idx + by * block_M:k_start_idx + (by + 1) * block_M, bx // groups, :], - V_shared) + T.copy(K[k_start_idx + by * block_M : k_start_idx + (by + 1) * block_M, bx // groups, :], K_shared) + T.copy(V[k_start_idx + by * block_M : k_start_idx + (by + 1) * block_M, bx // groups, :], V_shared) T.clear(dv) T.clear(dk) - loop_st = T.min( - T.floordiv(by * block_M, block_N), T.floordiv(q_current_seqlen, - block_N)) if is_causal else 0 + loop_st = T.min(T.floordiv(by * block_M, block_N), T.floordiv(q_current_seqlen, block_N)) if is_causal else 0 loop_ed = T.ceildiv(q_current_seqlen, block_N) for k_base in T.Pipelined(loop_st, loop_ed, num_stages=num_stages): - T.copy( - Q[q_start_idx + k_base * block_N:q_start_idx + (k_base + 1) * block_N, bx, :], - q) + T.copy(Q[q_start_idx + k_base * block_N : q_start_idx + (k_base + 1) * block_N, bx, :], q) T.clear(qkT) T.gemm(K_shared, q, qkT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - T.copy(lse[bz, bx, k_base * block_N:(k_base + 1) * block_N], lse_shared) + T.copy(lse[bz, bx, k_base * block_N : (k_base + 1) * block_N], lse_shared) for i, j in T.Parallel(block_M, block_N): qkT[i, j] = T.exp2(qkT[i, j] * scale - lse_shared[j]) if is_causal: for i, j in T.Parallel(block_M, block_N): - qkT[i, j] = T.if_then_else((by * block_M + i <= k_base * block_N + j) and - (by * block_M + i < k_current_seqlen and - k_base * block_N + j < q_current_seqlen), - qkT[i, j], 0) + qkT[i, j] = T.if_then_else( + (by * block_M + i <= k_base * block_N + j) + and (by * block_M + i < k_current_seqlen and k_base * block_N + j < q_current_seqlen), + qkT[i, j], + 0, + ) else: for i, j in T.Parallel(block_M, block_N): qkT[i, j] = T.if_then_else( - by * block_M + i < k_current_seqlen and - k_base * block_N + j < q_current_seqlen, qkT[i, j], 0) + by * block_M + i < k_current_seqlen and k_base * block_N + j < q_current_seqlen, qkT[i, j], 0 + ) - T.copy( - dO[q_start_idx + 
k_base * block_N:q_start_idx + (k_base + 1) * block_N, bx, :], - do) + T.copy(dO[q_start_idx + k_base * block_N : q_start_idx + (k_base + 1) * block_N, bx, :], do) T.clear(dsT) # dsT: (block_kv, block_q) T.gemm(V_shared, do, dsT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) T.copy(qkT, qkT_cast) T.gemm(qkT_cast, do, dv, policy=T.GemmWarpPolicy.FullRow) - T.copy(Delta[bz, bx, k_base * block_N:(k_base + 1) * block_N], delta) + T.copy(Delta[bz, bx, k_base * block_N : (k_base + 1) * block_N], delta) for i, j in T.Parallel(block_M, block_N): dsT_cast[i, j] = qkT[i, j] * (dsT[i, j] - delta[j]) * sm_scale T.gemm(dsT_cast, q, dk, policy=T.GemmWarpPolicy.FullRow) @@ -362,49 +345,40 @@ def flashattn_bwd_atomic_add(batch, T.gemm(dsT_shared, K_shared, dq, transpose_A=True) T.copy(dq, dq_shared) T.atomic_add( - dQ[q_start_idx + k_base * block_N:q_start_idx + k_base * block_N + block_N, - bx, :], + dQ[q_start_idx + k_base * block_N : q_start_idx + k_base * block_N + block_N, bx, :], dq_shared, memory_order="relaxed", - use_tma=True) + use_tma=True, + ) T.copy(dv, dv_shared) T.atomic_add( - dV[k_start_idx + by * block_M:k_start_idx + by * block_M + block_M, - bx // groups, :], + dV[k_start_idx + by * block_M : k_start_idx + by * block_M + block_M, bx // groups, :], dv_shared, memory_order="relaxed", - use_tma=True) + use_tma=True, + ) T.copy(dk, dk_shared) T.atomic_add( - dK[k_start_idx + by * block_M:k_start_idx + by * block_M + block_M, - bx // groups, :], + dK[k_start_idx + by * block_M : k_start_idx + by * block_M + block_M, bx // groups, :], dk_shared, memory_order="relaxed", - use_tma=True) + use_tma=True, + ) return flash_bwd -@tilelang.jit(pass_configs={ - tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, -}) -def flashattn_bwd_split(batch, - total_q, - total_kv, - N_CTX, - heads, - max_seq_len, - dim_qk, - dim_v, - is_causal, - block_M, - block_N, - threads=256, - num_stages=2, - groups=1): - sm_scale = (1.0 / dim_qk)**0.5 - scale = (1.0 / dim_qk)**0.5 * 1.44269504 # log2(e) +@tilelang.jit( + pass_configs={ + tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, + } +) +def flashattn_bwd_split( + batch, total_q, total_kv, N_CTX, heads, max_seq_len, dim_qk, dim_v, is_causal, block_M, block_N, threads=256, num_stages=2, groups=1 +): + sm_scale = (1.0 / dim_qk) ** 0.5 + scale = (1.0 / dim_qk) ** 0.5 * 1.44269504 # log2(e) head_kv = heads // groups q_shape = [total_q, heads, dim_qk] k_shape = [total_kv, head_kv, dim_qk] @@ -412,25 +386,24 @@ def flashattn_bwd_split(batch, do_shape = [total_q, heads, dim_v] dk_shape = [groups, total_kv, head_kv, dim_qk] # sum after kernel dv_shape = [groups, total_kv, head_kv, dim_v] # sum after kernel - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 @T.prim_func def flash_bwd( - Q: T.Tensor(q_shape, dtype), # type: ignore - K: T.Tensor(k_shape, dtype), # type: ignore - V: T.Tensor(v_shape, dtype), # type: ignore - dO: T.Tensor(do_shape, dtype), # type: ignore - lse: T.Tensor([batch, heads, N_CTX], accum_dtype), # type: ignore - Delta: T.Tensor([batch, heads, N_CTX], accum_dtype), # type: ignore - cu_seqlens_q: T.Tensor([batch + 1], "int32"), # type: ignore - cu_seqlens_k: T.Tensor([batch + 1], "int32"), # type: ignore - dQ: T.Tensor(q_shape, accum_dtype), # type: ignore - dK: T.Tensor(dk_shape, dtype), # type: ignore - dV: T.Tensor(dv_shape, dtype), # type: ignore + Q: T.Tensor(q_shape, dtype), # type: ignore + K: T.Tensor(k_shape, dtype), # type: ignore + V: T.Tensor(v_shape, dtype), # type: ignore + dO: T.Tensor(do_shape, 
dtype), # type: ignore + lse: T.Tensor([batch, heads, N_CTX], accum_dtype), # type: ignore + Delta: T.Tensor([batch, heads, N_CTX], accum_dtype), # type: ignore + cu_seqlens_q: T.Tensor([batch + 1], T.int32), # type: ignore + cu_seqlens_k: T.Tensor([batch + 1], T.int32), # type: ignore + dQ: T.Tensor(q_shape, accum_dtype), # type: ignore + dK: T.Tensor(dk_shape, dtype), # type: ignore + dV: T.Tensor(dv_shape, dtype), # type: ignore ): - with T.Kernel( - heads, T.ceildiv(max_seq_len, block_M), batch, threads=threads) as (bx, by, bz): + with T.Kernel(heads, T.ceildiv(max_seq_len, block_M), batch, threads=threads) as (bx, by, bz): K_shared = T.alloc_shared([block_M, dim_qk], dtype) dsT_shared = T.alloc_shared([block_M, block_N], dtype) q = T.alloc_shared([block_N, dim_qk], dtype) @@ -455,59 +428,55 @@ def flashattn_bwd_split(batch, q_current_seqlen = q_end_idx - q_start_idx k_current_seqlen = k_end_idx - k_start_idx - T.annotate_layout({ - dQ: make_dq_layout(dQ), - K_shared: tilelang.layout.make_swizzled_layout(K_shared), - dv_shared: tilelang.layout.make_swizzled_layout(dv_shared), - dk_shared: tilelang.layout.make_swizzled_layout(dk_shared), - }) + T.annotate_layout( + { + dQ: make_dq_layout(dQ), + K_shared: tilelang.layout.make_swizzled_layout(K_shared), + dv_shared: tilelang.layout.make_swizzled_layout(dv_shared), + dk_shared: tilelang.layout.make_swizzled_layout(dk_shared), + } + ) - T.copy(K[k_start_idx + by * block_M:k_start_idx + (by + 1) * block_M, bx // groups, :], - K_shared) - T.copy(V[k_start_idx + by * block_M:k_start_idx + (by + 1) * block_M, bx // groups, :], - V_shared) + T.copy(K[k_start_idx + by * block_M : k_start_idx + (by + 1) * block_M, bx // groups, :], K_shared) + T.copy(V[k_start_idx + by * block_M : k_start_idx + (by + 1) * block_M, bx // groups, :], V_shared) T.clear(dv) T.clear(dk) - loop_st = T.min( - T.floordiv(by * block_M, block_N), T.floordiv(q_current_seqlen, - block_N)) if is_causal else 0 + loop_st = T.min(T.floordiv(by * block_M, block_N), T.floordiv(q_current_seqlen, block_N)) if is_causal else 0 loop_ed = T.ceildiv(q_current_seqlen, block_N) for k_base in T.Pipelined(loop_st, loop_ed, num_stages=num_stages): # Note: The padding zero of varlen should be considered in T.copy - T.copy( - Q[q_start_idx + k_base * block_N:q_start_idx + (k_base + 1) * block_N, bx, :], - q) + T.copy(Q[q_start_idx + k_base * block_N : q_start_idx + (k_base + 1) * block_N, bx, :], q) T.clear(qkT) T.gemm(K_shared, q, qkT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - T.copy( - dO[q_start_idx + k_base * block_N:q_start_idx + (k_base + 1) * block_N, bx, :], - do) + T.copy(dO[q_start_idx + k_base * block_N : q_start_idx + (k_base + 1) * block_N, bx, :], do) T.clear(dsT) T.gemm(V_shared, do, dsT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - T.copy(lse[bz, bx, k_base * block_N:(k_base + 1) * block_N], lse_shared) + T.copy(lse[bz, bx, k_base * block_N : (k_base + 1) * block_N], lse_shared) for i, j in T.Parallel(block_M, block_N): qkT[i, j] = T.exp2(qkT[i, j] * scale - lse_shared[j]) if is_causal: for i, j in T.Parallel(block_M, block_N): - qkT[i, j] = T.if_then_else((by * block_M + i <= k_base * block_N + j) and - (by * block_M + i < k_current_seqlen and - k_base * block_N + j < q_current_seqlen), - qkT[i, j], 0) + qkT[i, j] = T.if_then_else( + (by * block_M + i <= k_base * block_N + j) + and (by * block_M + i < k_current_seqlen and k_base * block_N + j < q_current_seqlen), + qkT[i, j], + 0, + ) else: for i, j in T.Parallel(block_M, block_N): qkT[i, j] = T.if_then_else( 
- by * block_M + i < k_current_seqlen and - k_base * block_N + j < q_current_seqlen, qkT[i, j], 0) + by * block_M + i < k_current_seqlen and k_base * block_N + j < q_current_seqlen, qkT[i, j], 0 + ) T.copy(qkT, qkT_cast) T.gemm(qkT_cast, do, dv, policy=T.GemmWarpPolicy.FullRow) - T.copy(Delta[bz, bx, k_base * block_N:(k_base + 1) * block_N], delta) + T.copy(Delta[bz, bx, k_base * block_N : (k_base + 1) * block_N], delta) for i, j in T.Parallel(block_M, block_N): dsT_cast[i, j] = qkT[i, j] * (dsT[i, j] - delta[j]) * sm_scale @@ -518,62 +487,37 @@ def flashattn_bwd_split(batch, T.gemm(dsT_shared, K_shared, dq, transpose_A=True) for i, j in T.Parallel(block_N, dim_qk): if k_base * block_N + i < q_current_seqlen: - T.atomic_add( - dQ[q_start_idx + k_base * block_N + i, bx, j], - dq[i, j], - memory_order="relaxed") + T.atomic_add(dQ[q_start_idx + k_base * block_N + i, bx, j], dq[i, j], memory_order="relaxed") T.copy(dv, dv_shared) - T.copy( - dv_shared, - dV[bx % groups, k_start_idx + by * block_M:k_start_idx + by * block_M + block_M, - bx // groups, :]) + T.copy(dv_shared, dV[bx % groups, k_start_idx + by * block_M : k_start_idx + by * block_M + block_M, bx // groups, :]) T.copy(dk, dk_shared) - T.copy( - dk_shared, - dK[bx % groups, k_start_idx + by * block_M:k_start_idx + by * block_M + block_M, - bx // groups, :]) + T.copy(dk_shared, dK[bx % groups, k_start_idx + by * block_M : k_start_idx + by * block_M + block_M, bx // groups, :]) return flash_bwd @torch.compile class _attention(torch.autograd.Function): - @staticmethod - def forward(ctx, - q, - k, - v, - seqlens_q, - seqlens_k, - cu_seqlens_q, - cu_seqlens_k, - max_seqlen_q, - max_seqlen_k, - causal, - groups=1, - use_atomic=True): + def forward( + ctx, q, k, v, seqlens_q, seqlens_k, cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k, causal, groups=1, use_atomic=True + ): BATCH, N_CTX, H, D_HEAD_QK = q.shape D_HEAD_V = v.shape[-1] block_M = 128 block_N = 64 - q_unpad, indices_q, _, _ = unpad_input( - q, (torch.arange(N_CTX, device=q.device).unsqueeze(0) < seqlens_q.unsqueeze(1))) - k_unpad, indices_k, _, _ = unpad_input( - k, (torch.arange(N_CTX, device=k.device).unsqueeze(0) < seqlens_k.unsqueeze(1))) - v_unpad, _, _, _ = unpad_input( - v, (torch.arange(N_CTX, device=v.device).unsqueeze(0) < seqlens_k.unsqueeze(1))) + q_unpad, indices_q, _, _ = unpad_input(q, (torch.arange(N_CTX, device=q.device).unsqueeze(0) < seqlens_q.unsqueeze(1))) + k_unpad, indices_k, _, _ = unpad_input(k, (torch.arange(N_CTX, device=k.device).unsqueeze(0) < seqlens_k.unsqueeze(1))) + v_unpad, _, _, _ = unpad_input(v, (torch.arange(N_CTX, device=v.device).unsqueeze(0) < seqlens_k.unsqueeze(1))) total_q = q_unpad.shape[0] total_kv = k_unpad.shape[0] - mod = flashattn_fwd(BATCH, total_q, total_kv, N_CTX, H, max_seqlen_q, D_HEAD_QK, D_HEAD_V, - causal, block_M, block_N, groups) + mod = flashattn_fwd(BATCH, total_q, total_kv, N_CTX, H, max_seqlen_q, D_HEAD_QK, D_HEAD_V, causal, block_M, block_N, groups) o_unpad, lse = mod(q_unpad, k_unpad, v_unpad, cu_seqlens_q, cu_seqlens_k) o = pad_input(o_unpad, indices_q, BATCH, N_CTX) - ctx.save_for_backward(q_unpad, k_unpad, v_unpad, o_unpad, lse, seqlens_q, seqlens_k, - cu_seqlens_q, cu_seqlens_k) + ctx.save_for_backward(q_unpad, k_unpad, v_unpad, o_unpad, lse, seqlens_q, seqlens_k, cu_seqlens_q, cu_seqlens_k) ctx.batch = BATCH ctx.causal = causal ctx.use_atomic = use_atomic @@ -588,8 +532,7 @@ class _attention(torch.autograd.Function): N_CTX = do.shape[1] q, k, v, o, lse_clone, seqlens_q, seqlens_k, cu_seqlens_q, 
cu_seqlens_k = ctx.saved_tensors # lse_clone = lse.clone() - do_unpad, _, _, _ = unpad_input( - do, (torch.arange(N_CTX, device=do.device).unsqueeze(0) < seqlens_q.unsqueeze(1))) + do_unpad, _, _, _ = unpad_input(do, (torch.arange(N_CTX, device=do.device).unsqueeze(0) < seqlens_q.unsqueeze(1))) total_q, H, D_HEAD_QK = q.shape total_kv, HEAD_KV, D_HEAD_V = v.shape groups = H // HEAD_KV @@ -622,7 +565,8 @@ class _attention(torch.autograd.Function): block_N, threads=256, num_stages=2, - groups=groups) + groups=groups, + ) dq = torch.zeros_like(q, dtype=torch.float32) dk = torch.zeros_like(k, dtype=torch.float32) dv = torch.zeros_like(v, dtype=torch.float32) @@ -643,13 +587,13 @@ class _attention(torch.autograd.Function): block_N, threads=256, num_stages=2, - groups=groups) + groups=groups, + ) dq = torch.zeros_like(q, dtype=torch.float32) dk = torch.empty(groups, *k.shape, dtype=torch.float16, device=q.device) dv = torch.empty(groups, *v.shape, dtype=torch.float16, device=q.device) kernel(q, k, v, do, lse_clone, delta, cu_seqlens_q, cu_seqlens_k, dq, dk, dv) - dq, _, _ = mod_post(dq, torch.zeros_like(k, dtype=torch.float32), - torch.zeros_like(v, dtype=torch.float32)) + dq, _, _ = mod_post(dq, torch.zeros_like(k, dtype=torch.float32), torch.zeros_like(v, dtype=torch.float32)) dk, dv = dk.sum(0), dv.sum(0) dq = pad_input(dq, ctx.indices_q, BATCH, N_CTX) @@ -668,15 +612,13 @@ def ref_program(Q, K, V, padding_mask, is_causal, groups=1): # HQ = HKV * groups # To handle precision issue Q, K, V = Q.float(), K.float(), V.float() - assert Q.size(2) == K.size( - 2) * groups, f"Q.size(2): {Q.size(2)}, K.size(2): {K.size(2)}, groups: {groups}" - assert Q.size(2) == V.size( - 2) * groups, f"Q.size(2): {Q.size(2)}, V.size(2): {V.size(2)}, groups: {groups}" + assert Q.size(2) == K.size(2) * groups, f"Q.size(2): {Q.size(2)}, K.size(2): {K.size(2)}, groups: {groups}" + assert Q.size(2) == V.size(2) * groups, f"Q.size(2): {Q.size(2)}, V.size(2): {V.size(2)}, groups: {groups}" dim_qk = Q.size(-1) K = K.repeat_interleave(groups, dim=2) V = V.repeat_interleave(groups, dim=2) - scores = torch.einsum('bqhd,bkhd->bhqk', Q, K) + scores = torch.einsum("bqhd,bkhd->bhqk", Q, K) scores = scores / torch.sqrt(torch.tensor(dim_qk, dtype=scores.dtype)) if padding_mask is not None: scores.masked_fill_(rearrange(~padding_mask, "b s -> b 1 1 s"), float("-inf")) @@ -684,41 +626,35 @@ def ref_program(Q, K, V, padding_mask, is_causal, groups=1): seq_len = Q.size(1) mask = torch.tril(torch.ones(seq_len, seq_len, device=scores.device)) mask = mask.unsqueeze(0).unsqueeze(0) - scores = scores.masked_fill(mask == 0, float('-inf')) + scores = scores.masked_fill(mask == 0, float("-inf")) attention_weights = F.softmax(scores, dim=-1) - output = torch.einsum('bhqk,bkhd->bqhd', attention_weights, V) + output = torch.einsum("bhqk,bkhd->bqhd", attention_weights, V) if padding_mask is not None: output.masked_fill_(rearrange(~padding_mask, "b s -> b s 1 1"), 0.0) return output -def main(BATCH: int = 1, - H: int = 32, - N_CTX: int = 256, - D_HEAD_QK: int = 192, - D_HEAD_V: int = 128, - groups: int = 16, - causal: bool = False, - use_atomic: bool = True): +def main( + BATCH: int = 1, + H: int = 32, + N_CTX: int = 256, + D_HEAD_QK: int = 192, + D_HEAD_V: int = 128, + groups: int = 16, + causal: bool = False, + use_atomic: bool = True, +): flops_per_qk = 2.0 * BATCH * H * N_CTX * N_CTX * D_HEAD_QK flops_per_v = 2.0 * BATCH * H * N_CTX * N_CTX * D_HEAD_V total_flops = 3 * flops_per_qk + 2 * flops_per_v if causal: total_flops *= 0.5 - Q = ( - 
torch.empty(BATCH, N_CTX, H, D_HEAD_QK, dtype=torch.half, - device="cuda").normal_().requires_grad_()) + Q = torch.empty(BATCH, N_CTX, H, D_HEAD_QK, dtype=torch.half, device="cuda").normal_().requires_grad_() head_kv = H // groups - K = ( - torch.empty(BATCH, N_CTX, head_kv, D_HEAD_QK, dtype=torch.half, - device="cuda").normal_().requires_grad_()) - V = ( - torch.empty(BATCH, N_CTX, head_kv, D_HEAD_V, dtype=torch.half, - device="cuda").normal_().requires_grad_()) - dO = ( - torch.empty(BATCH, N_CTX, H, D_HEAD_V, dtype=torch.half, - device="cuda").normal_().requires_grad_()) + K = torch.empty(BATCH, N_CTX, head_kv, D_HEAD_QK, dtype=torch.half, device="cuda").normal_().requires_grad_() + V = torch.empty(BATCH, N_CTX, head_kv, D_HEAD_V, dtype=torch.half, device="cuda").normal_().requires_grad_() + dO = torch.empty(BATCH, N_CTX, H, D_HEAD_V, dtype=torch.half, device="cuda").normal_().requires_grad_() padding_mask = generate_random_padding_mask(N_CTX, BATCH, "cuda", mode="random") seqlens_q = padding_mask.sum(dim=-1, dtype=torch.int32) cu_seqlens_q = F.pad(torch.cumsum(seqlens_q, dim=0, dtype=torch.int32), (1, 0)) @@ -727,8 +663,7 @@ def main(BATCH: int = 1, # In training backward pass, seqlens_k should be the same as seqlens_q seqlens_k, cu_seqlens_k, max_seqlen_k = seqlens_q, cu_seqlens_q, max_seqlen_q - O = attention(Q, K, V, seqlens_q, seqlens_k, cu_seqlens_q, cu_seqlens_k, max_seqlen_q, - max_seqlen_k, causal, groups, use_atomic) + O = attention(Q, K, V, seqlens_q, seqlens_k, cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k, causal, groups, use_atomic) O.backward(dO, retain_graph=True) dQ, Q.grad = Q.grad.clone(), None dK, K.grad = K.grad.clone(), None @@ -770,17 +705,15 @@ if __name__ == "__main__": print(f"Detected GPU compute capability: {arch}") assert float(arch) >= 9.0, "This example only supports GPU with compute capability >= 9.0" parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=8, help='Batch size') - parser.add_argument('--h', type=int, default=32, help='Number of heads') - parser.add_argument('--n_ctx', type=int, default=1024, help='Context size') - parser.add_argument('--d_head_qk', type=int, default=192, help='Head dimension for Q/K') - parser.add_argument('--d_head_v', type=int, default=128, help='Head dimension for V') - parser.add_argument('--causal', action='store_true', help='Causal flag') - parser.add_argument('--groups', type=int, default=16, help='groups') - parser.add_argument( - '--use_atomic', action='store_true', default=False, help='Use atomic add for dK/dV') - parser.add_argument( - '--use_split', action='store_true', default=False, help='Use split for dK/dV') + parser.add_argument("--batch", type=int, default=8, help="Batch size") + parser.add_argument("--h", type=int, default=32, help="Number of heads") + parser.add_argument("--n_ctx", type=int, default=1024, help="Context size") + parser.add_argument("--d_head_qk", type=int, default=192, help="Head dimension for Q/K") + parser.add_argument("--d_head_v", type=int, default=128, help="Head dimension for V") + parser.add_argument("--causal", action="store_true", help="Causal flag") + parser.add_argument("--groups", type=int, default=16, help="groups") + parser.add_argument("--use_atomic", action="store_true", default=False, help="Use atomic add for dK/dV") + parser.add_argument("--use_split", action="store_true", default=False, help="Use split for dK/dV") args = parser.parse_args() # Can be set to True/False for testing args.causal = True @@ -794,5 +727,4 @@ if __name__ 
== "__main__": # Default: use atomic use_atomic = True - main(args.batch, args.h, args.n_ctx, args.d_head_qk, args.d_head_v, args.groups, args.causal, - use_atomic) + main(args.batch, args.h, args.n_ctx, args.d_head_qk, args.d_head_v, args.groups, args.causal, use_atomic) diff --git a/examples/flash_attention/example_gqa_bwd_wgmma_pipelined.py b/examples/flash_attention/example_gqa_bwd_wgmma_pipelined.py index ed07e7d9d39b3b36f155cf3d204e226b99bf81a7..f4e2de27752cee99c77fab091b599a4de5e65928 100644 --- a/examples/flash_attention/example_gqa_bwd_wgmma_pipelined.py +++ b/examples/flash_attention/example_gqa_bwd_wgmma_pipelined.py @@ -6,25 +6,27 @@ import argparse @tilelang.jit( - out_idx=[3, 4], pass_configs={ + out_idx=[3, 4], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def flashattn_fwd(batch, heads, seq_len, dim_qk, dim_v, is_causal, block_M, block_N, groups=1): - scale = (1.0 / dim_qk)**0.5 * 1.44269504 # log2(e) + scale = (1.0 / dim_qk) ** 0.5 * 1.44269504 # log2(e) head_kv = heads // groups q_shape = [batch, seq_len, heads, dim_qk] k_shape = [batch, seq_len, head_kv, dim_qk] v_shape = [batch, seq_len, head_kv, dim_v] - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 @T.prim_func def flash_fwd( - Q: T.Tensor(q_shape, dtype), # type: ignore - K: T.Tensor(k_shape, dtype), # type: ignore - V: T.Tensor(v_shape, dtype), # type: ignore - Output: T.Tensor([batch, seq_len, heads, dim_v], dtype), # type: ignore - lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + Q: T.Tensor(q_shape, dtype), # type: ignore + K: T.Tensor(k_shape, dtype), # type: ignore + V: T.Tensor(v_shape, dtype), # type: ignore + Output: T.Tensor([batch, seq_len, heads, dim_v], dtype), # type: ignore + lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore ): with T.Kernel(T.ceildiv(seq_len, block_M), heads, batch, threads=256) as (bx, by, bz): Q_shared = T.alloc_shared([block_M, dim_qk], dtype) @@ -40,25 +42,25 @@ def flashattn_fwd(batch, heads, seq_len, dim_qk, dim_v, is_causal, block_M, bloc logsum = T.alloc_fragment([block_M], accum_dtype) T.annotate_layout({Q_shared: tilelang.layout.make_swizzled_layout(Q_shared)}) - T.copy(Q[bz, bx * block_M:(bx + 1) * block_M, by, :], Q_shared) + T.copy(Q[bz, bx * block_M : (bx + 1) * block_M, by, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) - loop_range = ( - T.ceildiv( - (bx + 1) * block_M, block_N) if is_causal else T.ceildiv(seq_len, block_N)) + loop_range = T.ceildiv((bx + 1) * block_M, block_N) if is_causal else T.ceildiv(seq_len, block_N) for k in T.Pipelined(loop_range, num_stages=1): - T.copy(K[bz, k * block_N:(k + 1) * block_N, by // groups, :], K_shared) + T.copy(K[bz, k * block_N : (k + 1) * block_N, by // groups, :], K_shared) if is_causal: for i, j in T.Parallel(block_M, block_N): - acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0, - -T.infinity(acc_s.dtype)) + acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0, -T.infinity(acc_s.dtype)) else: - T.clear(acc_s) + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.if_then_else(k * block_N + j >= seq_len, -T.infinity(acc_s.dtype), 0) T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - T.copy(V[bz, k * block_N:(k + 1) * block_N, by // groups, :], V_shared) + T.copy(V[bz, k * block_N : (k + 1) * block_N, by // groups, :], V_shared) T.copy(scores_max, scores_max_prev) T.reduce_max(acc_s, 
scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) for i in T.Parallel(block_M): scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_M, dim_v): @@ -72,29 +74,31 @@ def flashattn_fwd(batch, heads, seq_len, dim_qk, dim_v, is_causal, block_M, bloc logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] for i, j in T.Parallel(block_M, dim_v): acc_o[i, j] /= logsum[i] - T.copy(acc_o, Output[bz, bx * block_M:(bx + 1) * block_M, by, :]) + T.copy(acc_o, Output[bz, bx * block_M : (bx + 1) * block_M, by, :]) for i in T.Parallel(block_M): logsum[i] = T.log2(logsum[i]) + scores_max[i] * scale - T.copy(logsum, lse[bz, by, bx * block_M:(bx + 1) * block_M]) + T.copy(logsum, lse[bz, by, bx * block_M : (bx + 1) * block_M]) return flash_fwd @tilelang.jit( - out_idx=[2], pass_configs={ + out_idx=[2], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def flashattn_bwd_preprocess(batch, heads, seq_len, dim_v): - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 shape = [batch, seq_len, heads, dim_v] blk = 32 @T.prim_func def flash_bwd_prep( - O: T.Tensor(shape, dtype), # type: ignore - dO: T.Tensor(shape, dtype), # type: ignore - Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + O: T.Tensor(shape, dtype), # type: ignore + dO: T.Tensor(shape, dtype), # type: ignore + Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore ): with T.Kernel(heads, T.ceildiv(seq_len, blk), batch) as (bx, by, bz): o = T.alloc_fragment([blk, blk], dtype) @@ -103,50 +107,42 @@ def flashattn_bwd_preprocess(batch, heads, seq_len, dim_v): delta = T.alloc_fragment([blk], accum_dtype) T.clear(acc) for k in range(T.ceildiv(dim_v, blk)): - T.copy(O[bz, by * blk:(by + 1) * blk, bx, k * blk:(k + 1) * blk], o) - T.copy(dO[bz, by * blk:(by + 1) * blk, bx, k * blk:(k + 1) * blk], do) + T.copy(O[bz, by * blk : (by + 1) * blk, bx, k * blk : (k + 1) * blk], o) + T.copy(dO[bz, by * blk : (by + 1) * blk, bx, k * blk : (k + 1) * blk], do) for i, j in T.Parallel(blk, blk): acc[i, j] += o[i, j] * do[i, j] T.reduce_sum(acc, delta, 1) - T.copy(delta, Delta[bz, bx, by * blk:(by + 1) * blk]) + T.copy(delta, Delta[bz, bx, by * blk : (by + 1) * blk]) return flash_bwd_prep -@tilelang.jit(pass_configs={ - tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, -}) -def flashattn_bwd(batch, - heads, - seq_len, - dim_qk, - dim_v, - is_causal, - block_M, - block_N, - threads=256, - num_stages=2, - groups=1): - sm_scale = (1.0 / dim_qk)**0.5 - scale = (1.0 / dim_qk)**0.5 * 1.44269504 # log2(e) +@tilelang.jit( + pass_configs={ + tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, + } +) +def flashattn_bwd(batch, heads, seq_len, dim_qk, dim_v, is_causal, block_M, block_N, threads=256, num_stages=2, groups=1): + sm_scale = (1.0 / dim_qk) ** 0.5 + scale = (1.0 / dim_qk) ** 0.5 * 1.44269504 # log2(e) head_kv = heads // groups q_shape = [batch, seq_len, heads, dim_qk] k_shape = [batch, seq_len, head_kv, dim_qk] v_shape = [batch, seq_len, head_kv, dim_v] - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 @T.prim_func def flash_bwd( - Q: T.Tensor(q_shape, dtype), # type: ignore - K: T.Tensor(k_shape, dtype), # type: ignore - V: T.Tensor(v_shape, dtype), # type: ignore - dO: T.Tensor([batch, seq_len, heads, dim_v], dtype), # type: ignore - lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore - Delta: 
T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore - dQ: T.Tensor(q_shape, accum_dtype), # type: ignore - dK: T.Tensor(k_shape, accum_dtype), # type: ignore - dV: T.Tensor(v_shape, accum_dtype), # type: ignore + Q: T.Tensor(q_shape, dtype), # type: ignore + K: T.Tensor(k_shape, dtype), # type: ignore + V: T.Tensor(v_shape, dtype), # type: ignore + dO: T.Tensor([batch, seq_len, heads, dim_v], dtype), # type: ignore + lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + dQ: T.Tensor(q_shape, accum_dtype), # type: ignore + dK: T.Tensor(k_shape, accum_dtype), # type: ignore + dV: T.Tensor(v_shape, accum_dtype), # type: ignore ): with T.Kernel(heads, T.ceildiv(seq_len, block_M), batch, threads=threads) as (bx, by, bz): K_shared = T.alloc_shared([block_M, dim_qk], dtype) @@ -167,45 +163,39 @@ def flashattn_bwd(batch, dv_shared = T.alloc_shared([block_M, dim_v], accum_dtype) dq_shared = T.alloc_shared([block_N, dim_qk], accum_dtype) - T.annotate_layout({ - K_shared: tilelang.layout.make_swizzled_layout(K_shared), - dq_shared: tilelang.layout.make_swizzled_layout(dq_shared), - dk_shared: tilelang.layout.make_swizzled_layout(dk_shared), - dv_shared: tilelang.layout.make_swizzled_layout(dv_shared), - }) - - T.copy(K[bz, by * block_M:(by + 1) * block_M, bx // groups, :], K_shared) - T.copy(V[bz, by * block_M:(by + 1) * block_M, bx // groups, :], V_shared) + T.annotate_layout( + { + K_shared: tilelang.layout.make_swizzled_layout(K_shared), + dq_shared: tilelang.layout.make_swizzled_layout(dq_shared), + dk_shared: tilelang.layout.make_swizzled_layout(dk_shared), + dv_shared: tilelang.layout.make_swizzled_layout(dv_shared), + } + ) + + T.copy(K[bz, by * block_M : (by + 1) * block_M, bx // groups, :], K_shared) + T.copy(V[bz, by * block_M : (by + 1) * block_M, bx // groups, :], V_shared) T.clear(dv) T.clear(dk) loop_st = T.floordiv(by * block_M, block_N) if is_causal else 0 loop_ed = T.ceildiv(seq_len, block_N) for k in T.Pipelined(loop_st, loop_ed, num_stages=num_stages): - T.copy(Q[bz, k * block_N:(k + 1) * block_N, bx, :], q) + T.copy(Q[bz, k * block_N : (k + 1) * block_N, bx, :], q) T.clear(qkT) - T.gemm( - K_shared, q, qkT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow, wg_wait=-1) - T.copy(lse[bz, bx, k * block_N:(k + 1) * block_N], lse_shared) + T.gemm(K_shared, q, qkT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow, wg_wait=-1) + T.copy(lse[bz, bx, k * block_N : (k + 1) * block_N], lse_shared) for i, j in T.Parallel(block_M, block_N): qkT[i, j] = T.exp2(qkT[i, j] * scale - lse_shared[j]) if is_causal: for i, j in T.Parallel(block_M, block_N): - qkT[i, j] = T.if_then_else(by * block_M + i <= k * block_N + j, qkT[i, j], - 0) - T.copy(dO[bz, k * block_N:(k + 1) * block_N, bx, :], do) + qkT[i, j] = T.if_then_else(by * block_M + i <= k * block_N + j, qkT[i, j], 0) + T.copy(dO[bz, k * block_N : (k + 1) * block_N, bx, :], do) T.clear(dsT) - T.gemm( - V_shared, - do, - dsT, - transpose_B=True, - policy=T.GemmWarpPolicy.FullRow, - wg_wait=-1) + T.gemm(V_shared, do, dsT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow, wg_wait=-1) T.wait_wgmma(1) T.copy(qkT, qkT_cast) T.gemm(qkT_cast, do, dv, policy=T.GemmWarpPolicy.FullRow, wg_wait=-1) - T.copy(Delta[bz, bx, k * block_N:(k + 1) * block_N], delta) + T.copy(Delta[bz, bx, k * block_N : (k + 1) * block_N], delta) for i, j in T.Parallel(block_M, block_N): dsT_cast[i, j] = qkT[i, j] * (dsT[i, j] - delta[j]) * sm_scale @@ -217,18 +207,17 @@ def 
flashattn_bwd(batch, T.gemm(dsT_shared, K_shared, dq, transpose_A=True, wg_wait=1) T.wait_wgmma(0) T.copy(dq, dq_shared) - T.atomic_add(dQ[bz, k * block_N:(k + 1) * block_N, bx, :], dq_shared) + T.atomic_add(dQ[bz, k * block_N : (k + 1) * block_N, bx, :], dq_shared) T.copy(dv, dv_shared) - T.atomic_add(dV[bz, by * block_M:(by + 1) * block_M, bx // groups, :], dv_shared) + T.atomic_add(dV[bz, by * block_M : (by + 1) * block_M, bx // groups, :], dv_shared) T.copy(dk, dk_shared) - T.atomic_add(dK[bz, by * block_M:(by + 1) * block_M, bx // groups, :], dk_shared) + T.atomic_add(dK[bz, by * block_M : (by + 1) * block_M, bx // groups, :], dk_shared) return flash_bwd @torch.compile class _attention(torch.autograd.Function): - @staticmethod def forward(ctx, q, k, v, causal, groups=1, use_atomic=True): BATCH, N_CTX, H, D_HEAD_QK = q.shape @@ -246,7 +235,10 @@ class _attention(torch.autograd.Function): def backward(ctx, do): q, k, v, o, lse = ctx.saved_tensors BATCH, N_CTX, H, D_HEAD_QK = q.shape - HEAD_KV, D_HEAD_V, = v.shape[-2], v.shape[-1] + ( + HEAD_KV, + D_HEAD_V, + ) = v.shape[-2], v.shape[-1] groups = H // HEAD_KV def maybe_contiguous(x): @@ -260,18 +252,7 @@ class _attention(torch.autograd.Function): mod_prep = flashattn_bwd_preprocess(BATCH, H, N_CTX, D_HEAD_V) delta = mod_prep(o, do) - kernel = flashattn_bwd( - BATCH, - H, - N_CTX, - D_HEAD_QK, - D_HEAD_V, - ctx.causal, - block_M, - block_N, - threads=256, - num_stages=2, - groups=groups) + kernel = flashattn_bwd(BATCH, H, N_CTX, D_HEAD_QK, D_HEAD_V, ctx.causal, block_M, block_N, threads=256, num_stages=2, groups=groups) shape_q = [BATCH, N_CTX, H, D_HEAD_QK] shape_k = [BATCH, N_CTX, HEAD_KV, D_HEAD_QK] shape_v = [BATCH, N_CTX, HEAD_KV, D_HEAD_V] @@ -294,52 +275,36 @@ def ref_program(Q, K, V, is_causal, groups=1): # K: [B, T, HK, D_QK] # V: [B, T, HV, D_V] # HQ = HKV * groups - assert Q.size(2) == K.size( - 2) * groups, f"Q.size(2): {Q.size(2)}, K.size(2): {K.size(2)}, groups: {groups}" - assert Q.size(2) == V.size( - 2) * groups, f"Q.size(2): {Q.size(2)}, V.size(2): {V.size(2)}, groups: {groups}" + assert Q.size(2) == K.size(2) * groups, f"Q.size(2): {Q.size(2)}, K.size(2): {K.size(2)}, groups: {groups}" + assert Q.size(2) == V.size(2) * groups, f"Q.size(2): {Q.size(2)}, V.size(2): {V.size(2)}, groups: {groups}" dim_qk = Q.size(-1) K = K.repeat_interleave(groups, dim=2) V = V.repeat_interleave(groups, dim=2) - scores = torch.einsum('bqhd,bkhd->bhqk', Q, K) + scores = torch.einsum("bqhd,bkhd->bhqk", Q, K) scores = scores / torch.sqrt(torch.tensor(dim_qk, dtype=scores.dtype)) if is_causal: seq_len = Q.size(1) mask = torch.tril(torch.ones(seq_len, seq_len, device=scores.device)) mask = mask.unsqueeze(0).unsqueeze(0) - scores = scores.masked_fill(mask == 0, float('-inf')) + scores = scores.masked_fill(mask == 0, float("-inf")) attention_weights = F.softmax(scores, dim=-1) - output = torch.einsum('bhqk,bkhd->bqhd', attention_weights, V) + output = torch.einsum("bhqk,bkhd->bqhd", attention_weights, V) return output -def main(BATCH: int = 1, - H: int = 32, - N_CTX: int = 256, - D_HEAD_QK: int = 192, - D_HEAD_V: int = 128, - groups: int = 16, - causal: bool = False): +def main(BATCH: int = 1, H: int = 32, N_CTX: int = 256, D_HEAD_QK: int = 192, D_HEAD_V: int = 128, groups: int = 16, causal: bool = False): flops_per_qk = 2.0 * BATCH * H * N_CTX * N_CTX * D_HEAD_QK flops_per_v = 2.0 * BATCH * H * N_CTX * N_CTX * D_HEAD_V total_flops = 3 * flops_per_qk + 2 * flops_per_v if causal: total_flops *= 0.5 - Q = ( - torch.empty(BATCH, N_CTX, H, D_HEAD_QK, 
dtype=torch.half, - device="cuda").normal_().requires_grad_()) + Q = torch.empty(BATCH, N_CTX, H, D_HEAD_QK, dtype=torch.half, device="cuda").normal_().requires_grad_() head_kv = H // groups - K = ( - torch.empty(BATCH, N_CTX, head_kv, D_HEAD_QK, dtype=torch.half, - device="cuda").normal_().requires_grad_()) - V = ( - torch.empty(BATCH, N_CTX, head_kv, D_HEAD_V, dtype=torch.half, - device="cuda").normal_().requires_grad_()) - dO = ( - torch.empty(BATCH, N_CTX, H, D_HEAD_V, dtype=torch.half, - device="cuda").normal_().requires_grad_()) + K = torch.empty(BATCH, N_CTX, head_kv, D_HEAD_QK, dtype=torch.half, device="cuda").normal_().requires_grad_() + V = torch.empty(BATCH, N_CTX, head_kv, D_HEAD_V, dtype=torch.half, device="cuda").normal_().requires_grad_() + dO = torch.empty(BATCH, N_CTX, H, D_HEAD_V, dtype=torch.half, device="cuda").normal_().requires_grad_() O = attention(Q, K, V, causal, groups) O.backward(dO, retain_graph=True) dQ, Q.grad = Q.grad.clone(), None @@ -356,7 +321,7 @@ def main(BATCH: int = 1, torch.testing.assert_close(dV, dV_ref, rtol=1e-2, atol=1e-2) torch.testing.assert_close(dK, dK_ref, rtol=1e-2, atol=1e-2) torch.testing.assert_close(dQ, dQ_ref, rtol=1e-2, atol=1e-2) - print('All checks passed.✅') + print("All checks passed.✅") def run(): O_ref.backward(dO, retain_graph=True) @@ -376,13 +341,13 @@ def main(BATCH: int = 1, if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=8, help='Batch size') - parser.add_argument('--h', type=int, default=32, help='Number of heads') - parser.add_argument('--n_ctx', type=int, default=1024, help='Context size') - parser.add_argument('--d_head_qk', type=int, default=192, help='Head dimension for Q/K') - parser.add_argument('--d_head_v', type=int, default=128, help='Head dimension for V') - parser.add_argument('--causal', action='store_true', help='Causal flag') - parser.add_argument('--groups', type=int, default=16, help='groups') + parser.add_argument("--batch", type=int, default=8, help="Batch size") + parser.add_argument("--h", type=int, default=32, help="Number of heads") + parser.add_argument("--n_ctx", type=int, default=1024, help="Context size") + parser.add_argument("--d_head_qk", type=int, default=192, help="Head dimension for Q/K") + parser.add_argument("--d_head_v", type=int, default=128, help="Head dimension for V") + parser.add_argument("--causal", action="store_true", help="Causal flag") + parser.add_argument("--groups", type=int, default=16, help="groups") args = parser.parse_args() main(args.batch, args.h, args.n_ctx, args.d_head_qk, args.d_head_v, args.groups, args.causal) diff --git a/examples/flash_attention/example_gqa_fwd_bshd.py b/examples/flash_attention/example_gqa_fwd_bshd.py index 4d9d06a4fc7f8abff7275daef6f442e8d5a9965e..5005435eaf7cb2349f09d8900a4718e9f84f52e6 100644 --- a/examples/flash_attention/example_gqa_fwd_bshd.py +++ b/examples/flash_attention/example_gqa_fwd_bshd.py @@ -9,7 +9,6 @@ from functools import partial class FlashAttentionTuneSpace: - def __init__( self, block_sizes=(64, 128, 256), @@ -40,7 +39,7 @@ def get_configs(user_config=None): warp_M = block_M // warp_count warp_N = block_N // warp_count - if (warp_M % config.warp_alignment != 0 or warp_N % config.warp_alignment != 0): + if warp_M % config.warp_alignment != 0 or warp_N % config.warp_alignment != 0: continue shared_mem = 2 * config.dtype_bytes * config.dim * (block_M + block_N) @@ -48,36 +47,31 @@ def get_configs(user_config=None): continue for num_stages in 
config.num_stages_range: - valid_configs.append({ - "block_M": block_M, - "block_N": block_N, - "num_stages": num_stages, - "threads": threads, - }) + valid_configs.append( + { + "block_M": block_M, + "block_N": block_N, + "num_stages": num_stages, + "threads": threads, + } + ) return valid_configs @autotune(configs=get_configs(), warmup=10, rep=10) @tilelang.jit( - out_idx=[3], pass_configs={ + out_idx=[3], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) -def flashattn(batch, - heads, - seq_len, - dim, - is_causal, - groups=1, - block_M=64, - block_N=64, - num_stages=0, - threads=128): - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) + }, +) +def flashattn(batch, heads, seq_len, dim, is_causal, groups=1, block_M=64, block_N=64, num_stages=0, threads=128): + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) head_kv = heads // groups q_shape = [batch, seq_len, heads, dim] kv_shape = [batch, seq_len, head_kv, dim] - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 @T.macro def MMA0( @@ -90,13 +84,13 @@ def flashattn(batch, by: T.int32, bz: T.int32, ): - T.copy(K[bz, k * block_N:(k + 1) * block_N, by // groups, :], K_shared) + T.copy(K[bz, k * block_N : (k + 1) * block_N, by // groups, :], K_shared) if is_causal: for i, j in T.Parallel(block_M, block_N): - acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0, - -T.infinity(acc_s.dtype)) + acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0, -T.infinity(acc_s.dtype)) else: - T.clear(acc_s) + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.if_then_else(k * block_N + j >= seq_len, -T.infinity(acc_s.dtype), 0) T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) @T.macro @@ -109,22 +103,24 @@ def flashattn(batch, by: T.int32, bz: T.int32, ): - T.copy(V[bz, k * block_N:(k + 1) * block_N, by // groups, :], V_shared) + T.copy(V[bz, k * block_N : (k + 1) * block_N, by // groups, :], V_shared) T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) @T.macro def Softmax( - acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype), - acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype), - scores_max: T.FragmentBuffer([block_M], accum_dtype), - scores_max_prev: T.FragmentBuffer([block_M], accum_dtype), - scores_scale: T.FragmentBuffer([block_M], accum_dtype), - scores_sum: T.FragmentBuffer([block_M], accum_dtype), - logsum: T.FragmentBuffer([block_M], accum_dtype), + acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype), + acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype), + scores_max: T.FragmentBuffer([block_M], accum_dtype), + scores_max_prev: T.FragmentBuffer([block_M], accum_dtype), + scores_scale: T.FragmentBuffer([block_M], accum_dtype), + scores_sum: T.FragmentBuffer([block_M], accum_dtype), + logsum: T.FragmentBuffer([block_M], accum_dtype), ): T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) # To do causal softmax, we need to set the scores_max to 0 if it is -inf # This process is called Check_inf in FlashAttention3 code, and it only need to be done # in the first ceil_div(kBlockM, kBlockN) steps. 
@@ -144,18 +140,18 @@ def flashattn(batch, @T.macro def Rescale( - acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), - scores_scale: T.FragmentBuffer([block_M], accum_dtype), + acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), + scores_scale: T.FragmentBuffer([block_M], accum_dtype), ): for i, j in T.Parallel(block_M, dim): acc_o[i, j] *= scores_scale[i] @T.prim_func def main( - Q: T.Tensor(q_shape, dtype), - K: T.Tensor(kv_shape, dtype), - V: T.Tensor(kv_shape, dtype), - Output: T.Tensor(q_shape, dtype), + Q: T.Tensor(q_shape, dtype), + K: T.Tensor(kv_shape, dtype), + V: T.Tensor(kv_shape, dtype), + Output: T.Tensor(q_shape, dtype), ): with T.Kernel(T.ceildiv(seq_len, block_M), heads, batch, threads=threads) as (bx, by, bz): Q_shared = T.alloc_shared([block_M, dim], dtype) @@ -171,25 +167,24 @@ def flashattn(batch, scores_sum = T.alloc_fragment([block_M], accum_dtype) logsum = T.alloc_fragment([block_M], accum_dtype) - T.copy(Q[bz, bx * block_M:(bx + 1) * block_M, by, :], Q_shared) + T.copy(Q[bz, bx * block_M : (bx + 1) * block_M, by, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) loop_range = ( - T.min(T.ceildiv(seq_len, block_N), T.ceildiv( - (bx + 1) * block_M, block_N)) if is_causal else T.ceildiv(seq_len, block_N)) + T.min(T.ceildiv(seq_len, block_N), T.ceildiv((bx + 1) * block_M, block_N)) if is_causal else T.ceildiv(seq_len, block_N) + ) for k in T.Pipelined(loop_range, num_stages=num_stages): MMA0(K, Q_shared, K_shared, acc_s, k, bx, by, bz) - Softmax(acc_s, acc_s_cast, scores_max, scores_max_prev, scores_scale, scores_sum, - logsum) + Softmax(acc_s, acc_s_cast, scores_max, scores_max_prev, scores_scale, scores_sum, logsum) Rescale(acc_o, scores_scale) MMA1(V, V_shared, acc_s_cast, acc_o, k, by, bz) for i, j in T.Parallel(block_M, dim): acc_o[i, j] /= logsum[i] T.copy(acc_o, O_shared) - T.copy(O_shared, Output[bz, bx * block_M:(bx + 1) * block_M, by, :]) + T.copy(O_shared, Output[bz, bx * block_M : (bx + 1) * block_M, by, :]) return main @@ -199,50 +194,34 @@ def ref_program(Q, K, V, is_causal, groups=1): # K: [B, T, HK, D] # V: [B, T, HV, D] # HQ = HKV * groups - assert Q.size(2) == K.size( - 2) * groups, f"Q.size(2): {Q.size(2)}, K.size(2): {K.size(2)}, groups: {groups}" - assert Q.size(2) == V.size( - 2) * groups, f"Q.size(2): {Q.size(2)}, V.size(2): {V.size(2)}, groups: {groups}" + assert Q.size(2) == K.size(2) * groups, f"Q.size(2): {Q.size(2)}, K.size(2): {K.size(2)}, groups: {groups}" + assert Q.size(2) == V.size(2) * groups, f"Q.size(2): {Q.size(2)}, V.size(2): {V.size(2)}, groups: {groups}" dim = Q.size(-1) K = K.repeat_interleave(groups, dim=2) V = V.repeat_interleave(groups, dim=2) - scores = torch.einsum('bqhd,bkhd->bhqk', Q, K) + scores = torch.einsum("bqhd,bkhd->bhqk", Q, K) scores = scores / torch.sqrt(torch.tensor(dim, dtype=scores.dtype)) if is_causal: seq_len = Q.size(1) mask = torch.tril(torch.ones(seq_len, seq_len, device=scores.device)) mask = mask.unsqueeze(0).unsqueeze(0) - scores = scores.masked_fill(mask == 0, float('-inf')) + scores = scores.masked_fill(mask == 0, float("-inf")) attention_weights = F.softmax(scores, dim=-1) - output = torch.einsum('bhqk,bkhd->bqhd', attention_weights, V) + output = torch.einsum("bhqk,bkhd->bqhd", attention_weights, V) return output -def main(batch: int = 1, - heads: int = 64, - seq_len: int = 4096, - dim: int = 128, - is_causal: bool = False, - groups: int = 16, - tune: bool = False): +def main( + batch: int = 1, heads: int = 64, seq_len: int = 4096, dim: int = 128, 
is_causal: bool = False, groups: int = 16, tune: bool = False +): flops_per_matmul = 2.0 * batch * heads * seq_len * seq_len * dim total_flops = 2 * flops_per_matmul if is_causal: total_flops *= 0.5 - if (not tune): - kernel = flashattn( - batch, - heads, - seq_len, - dim, - is_causal, - groups=groups, - block_M=64, - block_N=64, - num_stages=2, - threads=128) + if not tune: + kernel = flashattn(batch, heads, seq_len, dim, is_causal, groups=groups, block_M=64, block_N=64, num_stages=2, threads=128) ref_program_processed = partial(ref_program, is_causal=is_causal, groups=groups) profiler = kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Normal) profiler.assert_allclose(ref_program_processed, rtol=0.01, atol=0.01) @@ -266,12 +245,12 @@ def main(batch: int = 1, if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=1, help='batch size') - parser.add_argument('--heads', type=int, default=64, help='heads') - parser.add_argument('--seq_len', type=int, default=4096, help='sequence length') - parser.add_argument('--dim', type=int, default=128, help='dim') - parser.add_argument('--is_causal', action='store_true', help='causal') - parser.add_argument('--tune', action='store_true', help='tune configs') - parser.add_argument('--groups', type=int, default=16, help='groups') + parser.add_argument("--batch", type=int, default=1, help="batch size") + parser.add_argument("--heads", type=int, default=64, help="heads") + parser.add_argument("--seq_len", type=int, default=4096, help="sequence length") + parser.add_argument("--dim", type=int, default=128, help="dim") + parser.add_argument("--is_causal", action="store_true", help="causal") + parser.add_argument("--tune", action="store_true", help="tune configs") + parser.add_argument("--groups", type=int, default=16, help="groups") args = parser.parse_args() main(args.batch, args.heads, args.seq_len, args.dim, args.is_causal, args.groups, args.tune) diff --git a/examples/flash_attention/example_gqa_fwd_bshd_wgmma_pipelined.py b/examples/flash_attention/example_gqa_fwd_bshd_wgmma_pipelined.py index 1c1fc12d2ab2bca1f2dc9fcf8629a20beac0477a..7b7a71b1780ba0ea370d01f5f72379e59491f76e 100644 --- a/examples/flash_attention/example_gqa_fwd_bshd_wgmma_pipelined.py +++ b/examples/flash_attention/example_gqa_fwd_bshd_wgmma_pipelined.py @@ -24,9 +24,11 @@ def get_configs(): rep=10, ) @tilelang.jit( - out_idx=[3], pass_configs={ + out_idx=[3], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def flashattn( batch, heads, @@ -39,12 +41,12 @@ def flashattn( num_stages=0, threads=128, ): - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) head_kv = heads // groups q_shape = [batch, seq_len, heads, dim] kv_shape = [batch, seq_len, head_kv, dim] - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 @T.macro def MMA0( @@ -57,13 +59,13 @@ def flashattn( by: T.int32, bz: T.int32, ): - T.copy(K[bz, k * block_N:(k + 1) * block_N, by // groups, :], K_shared) + T.copy(K[bz, k * block_N : (k + 1) * block_N, by // groups, :], K_shared) if is_causal: for i, j in T.Parallel(block_M, block_N): - acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0, - -T.infinity(acc_s.dtype)) + acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0, -T.infinity(acc_s.dtype)) else: - T.clear(acc_s) + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.if_then_else(k * block_N + j >= 
seq_len, -T.infinity(acc_s.dtype), 0) T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) @T.macro @@ -76,22 +78,24 @@ def flashattn( by: T.int32, bz: T.int32, ): - T.copy(V[bz, k * block_N:(k + 1) * block_N, by // groups, :], V_shared) + T.copy(V[bz, k * block_N : (k + 1) * block_N, by // groups, :], V_shared) T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) @T.macro def Softmax( - acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype), - acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype), - scores_max: T.FragmentBuffer([block_M], accum_dtype), - scores_max_prev: T.FragmentBuffer([block_M], accum_dtype), - scores_scale: T.FragmentBuffer([block_M], accum_dtype), - scores_sum: T.FragmentBuffer([block_M], accum_dtype), - logsum: T.FragmentBuffer([block_M], accum_dtype), + acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype), + acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype), + scores_max: T.FragmentBuffer([block_M], accum_dtype), + scores_max_prev: T.FragmentBuffer([block_M], accum_dtype), + scores_scale: T.FragmentBuffer([block_M], accum_dtype), + scores_sum: T.FragmentBuffer([block_M], accum_dtype), + logsum: T.FragmentBuffer([block_M], accum_dtype), ): T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) # To do causal softmax, we need to set the scores_max to 0 if it is -inf # This process is called Check_inf in FlashAttention3 code, and it only need to be done # in the first ceil_div(kBlockM, kBlockN) steps. @@ -111,18 +115,18 @@ def flashattn( @T.macro def Rescale( - acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), - scores_scale: T.FragmentBuffer([block_M], accum_dtype), + acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), + scores_scale: T.FragmentBuffer([block_M], accum_dtype), ): for i, j in T.Parallel(block_M, dim): acc_o[i, j] *= scores_scale[i] @T.prim_func def main( - Q: T.Tensor(q_shape, dtype), - K: T.Tensor(kv_shape, dtype), - V: T.Tensor(kv_shape, dtype), - Output: T.Tensor(q_shape, dtype), + Q: T.Tensor(q_shape, dtype), + K: T.Tensor(kv_shape, dtype), + V: T.Tensor(kv_shape, dtype), + Output: T.Tensor(q_shape, dtype), ): with T.Kernel(T.ceildiv(seq_len, block_M), heads, batch, threads=threads) as (bx, by, bz): Q_shared = T.alloc_shared([block_M, dim], dtype) @@ -138,30 +142,30 @@ def flashattn( scores_sum = T.alloc_fragment([block_M], accum_dtype) logsum = T.alloc_fragment([block_M], accum_dtype) - T.copy(Q[bz, bx * block_M:(bx + 1) * block_M, by, :], Q_shared) + T.copy(Q[bz, bx * block_M : (bx + 1) * block_M, by, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) loop_range = ( - T.min(T.ceildiv(seq_len, block_N), T.ceildiv( - (bx + 1) * block_M, block_N)) if is_causal else T.ceildiv(seq_len, block_N)) + T.min(T.ceildiv(seq_len, block_N), T.ceildiv((bx + 1) * block_M, block_N)) if is_causal else T.ceildiv(seq_len, block_N) + ) for k in T.Pipelined( - loop_range, - num_stages=num_stages, - order=[-1, 0, 3, 1, -1, 2], - stage=[-1, 0, 0, 1, -1, 1], - group=[[0], [1, 2], [3, 4, 5, 6, 7, 8, 9, 10], [11], [12], [13]]): + loop_range, + num_stages=num_stages, + order=[-1, 0, 3, 1, -1, 2], + stage=[-1, 0, 0, 1, -1, 1], + group=[[0], [1, 2], [3, 4, 5, 6, 7, 8, 9, 10, 11], [12], [13], [14]], + ): MMA0(K, Q_shared, K_shared, acc_s, k, bx, by, bz) - Softmax(acc_s, acc_s_cast, scores_max, 
scores_max_prev, scores_scale, scores_sum, - logsum) + Softmax(acc_s, acc_s_cast, scores_max, scores_max_prev, scores_scale, scores_sum, logsum) Rescale(acc_o, scores_scale) MMA1(V, V_shared, acc_s_cast, acc_o, k, by, bz) for i, j in T.Parallel(block_M, dim): acc_o[i, j] /= logsum[i] T.copy(acc_o, O_shared) - T.copy(O_shared, Output[bz, bx * block_M:(bx + 1) * block_M, by, :]) + T.copy(O_shared, Output[bz, bx * block_M : (bx + 1) * block_M, by, :]) return main @@ -171,23 +175,21 @@ def ref_program(Q, K, V, is_causal, groups=1): # K: [B, T, HK, D] # V: [B, T, HV, D] # HQ = HKV * groups - assert Q.size(2) == K.size( - 2) * groups, f"Q.size(2): {Q.size(2)}, K.size(2): {K.size(2)}, groups: {groups}" - assert Q.size(2) == V.size( - 2) * groups, f"Q.size(2): {Q.size(2)}, V.size(2): {V.size(2)}, groups: {groups}" + assert Q.size(2) == K.size(2) * groups, f"Q.size(2): {Q.size(2)}, K.size(2): {K.size(2)}, groups: {groups}" + assert Q.size(2) == V.size(2) * groups, f"Q.size(2): {Q.size(2)}, V.size(2): {V.size(2)}, groups: {groups}" dim = Q.size(-1) K = K.repeat_interleave(groups, dim=2) V = V.repeat_interleave(groups, dim=2) - scores = torch.einsum('bqhd,bkhd->bhqk', Q, K) + scores = torch.einsum("bqhd,bkhd->bhqk", Q, K) scores = scores / torch.sqrt(torch.tensor(dim, dtype=scores.dtype)) if is_causal: seq_len = Q.size(1) mask = torch.tril(torch.ones(seq_len, seq_len, device=scores.device)) mask = mask.unsqueeze(0).unsqueeze(0) - scores = scores.masked_fill(mask == 0, float('-inf')) + scores = scores.masked_fill(mask == 0, float("-inf")) attention_weights = F.softmax(scores, dim=-1) - output = torch.einsum('bhqk,bkhd->bqhd', attention_weights, V) + output = torch.einsum("bhqk,bkhd->bqhd", attention_weights, V) return output @@ -205,18 +207,8 @@ def main( if is_causal: total_flops *= 0.5 - if (not tune): - kernel = flashattn( - batch, - heads, - seq_len, - dim, - is_causal, - groups=groups, - block_M=128, - block_N=128, - num_stages=2, - threads=256) + if not tune: + kernel = flashattn(batch, heads, seq_len, dim, is_causal, groups=groups, block_M=128, block_N=128, num_stages=2, threads=256) ref_program_processed = partial(ref_program, is_causal=is_causal, groups=groups) profiler = kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Normal) profiler.assert_allclose(ref_program_processed, rtol=0.01, atol=0.01) @@ -240,12 +232,12 @@ def main( if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=1, help='batch size') - parser.add_argument('--heads', type=int, default=64, help='heads') - parser.add_argument('--seq_len', type=int, default=4096, help='sequence length') - parser.add_argument('--dim', type=int, default=128, help='dim') - parser.add_argument('--is_causal', action='store_true', help='causal') - parser.add_argument('--tune', action='store_true', help='tune configs') - parser.add_argument('--groups', type=int, default=16, help='groups') + parser.add_argument("--batch", type=int, default=1, help="batch size") + parser.add_argument("--heads", type=int, default=64, help="heads") + parser.add_argument("--seq_len", type=int, default=4096, help="sequence length") + parser.add_argument("--dim", type=int, default=128, help="dim") + parser.add_argument("--is_causal", action="store_true", help="causal") + parser.add_argument("--tune", action="store_true", help="tune configs") + parser.add_argument("--groups", type=int, default=16, help="groups") args = parser.parse_args() main(args.batch, args.heads, args.seq_len, args.dim, 
args.is_causal, args.groups, args.tune) diff --git a/examples/flash_attention/example_gqa_fwd_varlen.py b/examples/flash_attention/example_gqa_fwd_varlen.py index db16e158649ed7d3d99630537eacd84f85cdc06f..b02345d93084843074d0924a4e945424bf104ca7 100644 --- a/examples/flash_attention/example_gqa_fwd_varlen.py +++ b/examples/flash_attention/example_gqa_fwd_varlen.py @@ -10,14 +10,14 @@ from varlen_utils import generate_random_padding_mask, generate_qkv def attention_ref( - q, - k, - v, - query_padding_mask=None, - key_padding_mask=None, - causal=False, - window_size=(-1, -1), - upcast=True, + q, + k, + v, + query_padding_mask=None, + key_padding_mask=None, + causal=False, + window_size=(-1, -1), + upcast=True, ): if causal: window_size = (window_size[0], 0) @@ -26,7 +26,7 @@ def attention_ref( q, k, v = q.float(), k.float(), v.float() b, T, Hq, D = q.shape S = k.shape[1] - scale = (1.0 / D)**0.5 + scale = (1.0 / D) ** 0.5 k = repeat(k, "b s h d -> b s (h g) d", g=Hq // k.shape[2]) v = repeat(v, "b s h d -> b s (h g) d", g=Hq // v.shape[2]) scores = torch.einsum("bthd,bshd->bhts", q, k) @@ -54,41 +54,31 @@ def attention_ref( @tilelang.jit( - out_idx=[6], pass_configs={ + out_idx=[6], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) -def flashattn(batch_size, - groups, - UQ, - UKV, - heads, - dim, - is_causal, - block_M=64, - block_N=64, - num_stages=1, - threads=128): - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) + }, +) +def flashattn(batch_size, groups, UQ, UKV, heads, dim, is_causal, block_M=64, block_N=64, num_stages=1, threads=128): + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) head_kv = heads // groups q_shape = [UQ, heads, dim] kv_shape = [UKV, head_kv, dim] o_shape = [UQ, heads, dim] - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 @T.prim_func def main( - Q_unpad: T.Tensor(q_shape, dtype), - K_unpad: T.Tensor(kv_shape, dtype), - V_unpad: T.Tensor(kv_shape, dtype), - cu_seqlens_q: T.Tensor([batch_size + 1], "int32"), - cu_seqlens_k: T.Tensor([batch_size + 1], "int32"), - max_seqlen_q: T.int32, - Output_unpad: T.Tensor(o_shape, dtype), + Q_unpad: T.Tensor(q_shape, dtype), + K_unpad: T.Tensor(kv_shape, dtype), + V_unpad: T.Tensor(kv_shape, dtype), + cu_seqlens_q: T.Tensor([batch_size + 1], T.int32), + cu_seqlens_k: T.Tensor([batch_size + 1], T.int32), + max_seqlen_q: T.int32, + Output_unpad: T.Tensor(o_shape, dtype), ): - with T.Kernel( - T.ceildiv(max_seqlen_q, block_M), heads, batch_size, - threads=threads) as (bx, by, bz): + with T.Kernel(T.ceildiv(max_seqlen_q, block_M), heads, batch_size, threads=threads) as (bx, by, bz): Q_shared = T.alloc_shared([block_M, dim], dtype) K_shared = T.alloc_shared([block_N, dim], dtype) V_shared = T.alloc_shared([block_N, dim], dtype) @@ -102,10 +92,12 @@ def flashattn(batch_size, scores_sum = T.alloc_fragment([block_M], accum_dtype) logsum = T.alloc_fragment([block_M], accum_dtype) - T.annotate_layout({ - O_shared: tilelang.layout.make_swizzled_layout(O_shared), - Q_shared: tilelang.layout.make_swizzled_layout(Q_shared), - }) + T.annotate_layout( + { + O_shared: tilelang.layout.make_swizzled_layout(O_shared), + Q_shared: tilelang.layout.make_swizzled_layout(Q_shared), + } + ) batch_idx = bz head_idx = by @@ -119,43 +111,40 @@ def flashattn(batch_size, q_current_seqlen = q_end_idx - q_start_idx kv_current_seqlen = k_end_idx - kv_start_idx - T.copy( - Q_unpad[q_start_idx + bx * block_M:q_start_idx + (bx + 1) * block_M, head_idx, :], - Q_shared) + T.copy(Q_unpad[q_start_idx + bx 
* block_M : q_start_idx + (bx + 1) * block_M, head_idx, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) loop_range = ( - T.min( - T.ceildiv(q_current_seqlen + - (bx + 1) * block_M, block_N), T.ceildiv(kv_current_seqlen, block_N)) - if is_causal else T.ceildiv(kv_current_seqlen, block_N)) + T.min(T.ceildiv(q_current_seqlen + (bx + 1) * block_M, block_N), T.ceildiv(kv_current_seqlen, block_N)) + if is_causal + else T.ceildiv(kv_current_seqlen, block_N) + ) for k in T.Pipelined(loop_range, num_stages=num_stages): - T.copy( - K_unpad[kv_start_idx + k * block_N:kv_start_idx + (k + 1) * block_N, - kv_head_idx, :], K_shared) + T.copy(K_unpad[kv_start_idx + k * block_N : kv_start_idx + (k + 1) * block_N, kv_head_idx, :], K_shared) if is_causal: for i, j in T.Parallel(block_M, block_N): - acc_s[i, - j] = T.if_then_else((bx * block_M + i < k * block_N + j) or - (bx * block_M + i >= q_current_seqlen or - k * block_N + j >= kv_current_seqlen), -1e9, 0) + acc_s[i, j] = T.if_then_else( + (bx * block_M + i < k * block_N + j) + or (bx * block_M + i >= q_current_seqlen or k * block_N + j >= kv_current_seqlen), + -1e9, + 0, + ) else: for i, j in T.Parallel(block_M, block_N): - acc_s[i, j] = T.if_then_else((bx * block_M + i >= q_current_seqlen or - k * block_N + j >= kv_current_seqlen), -1e9, - 0) + acc_s[i, j] = T.if_then_else( + (bx * block_M + i >= q_current_seqlen or k * block_N + j >= kv_current_seqlen), -1e9, 0 + ) T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, scores_max, dim=1, clear=False) - for i in T.Parallel(block_M): scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) @@ -171,9 +160,7 @@ def flashattn(batch_size, for i, j in T.Parallel(block_M, dim): acc_o[i, j] *= scores_scale[i] - T.copy( - V_unpad[kv_start_idx + k * block_N:kv_start_idx + (k + 1) * block_N, - kv_head_idx, :], V_shared) + T.copy(V_unpad[kv_start_idx + k * block_N : kv_start_idx + (k + 1) * block_N, kv_head_idx, :], V_shared) T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) @@ -188,13 +175,9 @@ def flashattn(batch_size, return main -def main(batch: int = 1, - heads: int = 64, - q_seqlen: int = 2048, - k_seqlen: int = 2048, - dim: int = 128, - groups: int = 16, - is_causal: bool = False): +def main( + batch: int = 1, heads: int = 64, q_seqlen: int = 2048, k_seqlen: int = 2048, dim: int = 128, groups: int = 16, is_causal: bool = False +): assert heads % groups == 0, "heads must be divisible by groups" flops_per_matmul = 2.0 * batch * heads * q_seqlen * k_seqlen * dim @@ -232,24 +215,12 @@ def main(batch: int = 1, output_pad_fn, _, _, - ) = generate_qkv( - q, k, v, query_padding_mask, key_padding_mask, kvpacked=False) + ) = generate_qkv(q, k, v, query_padding_mask, key_padding_mask, kvpacked=False) UQ = q_unpad.shape[0] UKV = k_unpad.shape[0] - kernel = flashattn( - batch, - groups, - UQ, - UKV, - heads, - dim, - is_causal, - block_M=128, - block_N=128, - num_stages=2, - threads=256) + kernel = flashattn(batch, groups, UQ, UKV, heads, dim, is_causal, block_M=128, block_N=128, num_stages=2, threads=256) out_unpad = kernel(q_unpad, k_unpad, v_unpad, cu_seqlens_q, cu_seqlens_k, max_seqlen_q) out = output_pad_fn(out_unpad) @@ -264,23 +235,19 @@ def main(batch: int = 1, ) torch.testing.assert_close(out, out_ref, rtol=1e-2, atol=1e-2) print("All checks passed.✅") - latency = do_bench( - lambda: kernel(q_unpad, k_unpad, v_unpad, 
cu_seqlens_q, cu_seqlens_k, max_seqlen_q), - _n_warmup=5, - _n_repeat=5) + latency = do_bench(lambda: kernel(q_unpad, k_unpad, v_unpad, cu_seqlens_q, cu_seqlens_k, max_seqlen_q), _n_warmup=5, _n_repeat=5) print("Tile-lang: {:.2f} ms".format(latency)) print("Tile-lang: {:.2f} TFlops".format(total_flops / latency * 1e-9)) if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=8, help='batch size') - parser.add_argument('--heads', type=int, default=64, help='query heads') - parser.add_argument('--groups', type=int, default=16, help='groups') - parser.add_argument('--q_seqlen', type=int, default=2048, help='query sequence length') - parser.add_argument('--k_seqlen', type=int, default=2048, help='key/value sequence length') - parser.add_argument('--dim', type=int, default=128, help='head dim') - parser.add_argument('--is_causal', action='store_true', help='causal attention') + parser.add_argument("--batch", type=int, default=8, help="batch size") + parser.add_argument("--heads", type=int, default=64, help="query heads") + parser.add_argument("--groups", type=int, default=16, help="groups") + parser.add_argument("--q_seqlen", type=int, default=2048, help="query sequence length") + parser.add_argument("--k_seqlen", type=int, default=2048, help="key/value sequence length") + parser.add_argument("--dim", type=int, default=128, help="head dim") + parser.add_argument("--is_causal", action="store_true", help="causal attention") args = parser.parse_args() - main(args.batch, args.heads, args.q_seqlen, args.k_seqlen, args.dim, args.groups, - args.is_causal) + main(args.batch, args.heads, args.q_seqlen, args.k_seqlen, args.dim, args.groups, args.is_causal) diff --git a/examples/flash_attention/example_mha_bwd_bhsd.py b/examples/flash_attention/example_mha_bwd_bhsd.py index 1595ae7646c5e1d17d559fc95c98c53c22691932..835a315965db00752f4096a60a2de4a4db10bf68 100644 --- a/examples/flash_attention/example_mha_bwd_bhsd.py +++ b/examples/flash_attention/example_mha_bwd_bhsd.py @@ -7,22 +7,24 @@ import argparse @tilelang.jit( - out_idx=[3, 4], pass_configs={ + out_idx=[3, 4], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def flashattn_fwd(batch, heads, seq_len, dim, is_causal, block_M, block_N): - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) shape = [batch, heads, seq_len, dim] - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 @T.prim_func def flash_fwd( - Q: T.Tensor(shape, dtype), # type: ignore - K: T.Tensor(shape, dtype), # type: ignore - V: T.Tensor(shape, dtype), # type: ignore - Output: T.Tensor(shape, dtype), # type: ignore - lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + Q: T.Tensor(shape, dtype), # type: ignore + K: T.Tensor(shape, dtype), # type: ignore + V: T.Tensor(shape, dtype), # type: ignore + Output: T.Tensor(shape, dtype), # type: ignore + lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore ): with T.Kernel(T.ceildiv(seq_len, block_M), heads, batch, threads=128) as (bx, by, bz): Q_shared = T.alloc_shared([block_M, dim], dtype) @@ -39,28 +41,28 @@ def flashattn_fwd(batch, heads, seq_len, dim, is_causal, block_M, block_N): logsum = T.alloc_fragment([block_M], accum_dtype) T.annotate_layout({Q_shared: tilelang.layout.make_swizzled_layout(Q_shared)}) - T.copy(Q[bz, by, bx * block_M:(bx + 1) * block_M, :], Q_shared) + T.copy(Q[bz, by, bx * block_M : (bx + 1) * block_M, :], 
Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) # T.copy(Q_shared, Q_local) # for i, j in T.Parallel(block_M, dim): # Q_local[i, j] *= scale - loop_range = ( - T.ceildiv( - (bx + 1) * block_M, block_N) if is_causal else T.ceildiv(seq_len, block_N)) + loop_range = T.ceildiv((bx + 1) * block_M, block_N) if is_causal else T.ceildiv(seq_len, block_N) for k in T.Pipelined(loop_range, num_stages=1): - T.copy(K[bz, by, k * block_N:(k + 1) * block_N, :], K_shared) + T.copy(K[bz, by, k * block_N : (k + 1) * block_N, :], K_shared) if is_causal: for i, j in T.Parallel(block_M, block_N): - acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0, - -T.infinity(acc_s.dtype)) + acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0, -T.infinity(acc_s.dtype)) else: - T.clear(acc_s) + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.if_then_else(k * block_N + j >= seq_len, -T.infinity(acc_s.dtype), 0) T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - T.copy(V[bz, by, k * block_N:(k + 1) * block_N, :], V_shared) + T.copy(V[bz, by, k * block_N : (k + 1) * block_N, :], V_shared) T.copy(scores_max, scores_max_prev) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) for i in T.Parallel(block_M): scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_M, dim): @@ -74,29 +76,31 @@ def flashattn_fwd(batch, heads, seq_len, dim, is_causal, block_M, block_N): logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] for i, j in T.Parallel(block_M, dim): acc_o[i, j] /= logsum[i] - T.copy(acc_o, Output[bz, by, bx * block_M:(bx + 1) * block_M, :]) + T.copy(acc_o, Output[bz, by, bx * block_M : (bx + 1) * block_M, :]) for i in T.Parallel(block_M): logsum[i] = T.log2(logsum[i]) + scores_max[i] * scale - T.copy(logsum, lse[bz, by, bx * block_M:(bx + 1) * block_M]) + T.copy(logsum, lse[bz, by, bx * block_M : (bx + 1) * block_M]) return flash_fwd @tilelang.jit( - out_idx=[2], pass_configs={ + out_idx=[2], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def flashattn_bwd_preprocess(batch, heads, seq_len, dim): - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 shape = [batch, heads, seq_len, dim] blk = 32 @T.prim_func def flash_bwd_prep( - O: T.Tensor(shape, dtype), # type: ignore - dO: T.Tensor(shape, dtype), # type: ignore - Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + O: T.Tensor(shape, dtype), # type: ignore + dO: T.Tensor(shape, dtype), # type: ignore + Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore ): with T.Kernel(heads, T.ceildiv(seq_len, blk), batch) as (bx, by, bz): o = T.alloc_fragment([blk, blk], dtype) @@ -105,68 +109,71 @@ def flashattn_bwd_preprocess(batch, heads, seq_len, dim): delta = T.alloc_fragment([blk], accum_dtype) T.clear(acc) for k in range(T.ceildiv(dim, blk)): - T.copy(O[bz, bx, by * blk:(by + 1) * blk, k * blk:(k + 1) * blk], o) - T.copy(dO[bz, bx, by * blk:(by + 1) * blk, k * blk:(k + 1) * blk], do) + T.copy(O[bz, bx, by * blk : (by + 1) * blk, k * blk : (k + 1) * blk], o) + T.copy(dO[bz, bx, by * blk : (by + 1) * blk, k * blk : (k + 1) * blk], do) for i, j in T.Parallel(blk, blk): acc[i, j] += o[i, j] * do[i, j] T.reduce_sum(acc, delta, 1) - T.copy(delta, Delta[bz, bx, by * blk:(by + 1) * blk]) + T.copy(delta, Delta[bz, bx, by * 
blk : (by + 1) * blk]) return flash_bwd_prep def make_dq_layout(dQ): # atomicAdd can not be vectorized, so we need to reorder dq to match the 8x8 gemm fragment - return T.Layout(dQ.shape, - lambda b, h, l, d: [b, h, l // 8, d // 8, (d % 2), 4 * (l % 8) + (d % 8) // 2]) + return T.Layout(dQ.shape, lambda b, h, l, d: [b, h, l // 8, d // 8, (d % 2), 4 * (l % 8) + (d % 8) // 2]) @tilelang.jit( - out_idx=[1], pass_configs={ + out_idx=[1], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def flashattn_bwd_postprocess(batch, heads, seq_len, dim): - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 shape = [batch, heads, seq_len, dim] blk = 64 @T.prim_func def flash_bwd_post( - dQ: T.Tensor(shape, accum_dtype), # type: ignore - dQ_out: T.Tensor(shape, dtype), # type: ignore + dQ: T.Tensor(shape, accum_dtype), # type: ignore + dQ_out: T.Tensor(shape, dtype), # type: ignore ): with T.Kernel(T.ceildiv(seq_len, blk), heads, batch, threads=128) as (bx, by, bz): T.annotate_layout({dQ: make_dq_layout(dQ)}) T.copy( - dQ[bz, by, bx * blk:(bx + 1) * blk, :], - dQ_out[bz, by, bx * blk:(bx + 1) * blk, :], + dQ[bz, by, bx * blk : (bx + 1) * blk, :], + dQ_out[bz, by, bx * blk : (bx + 1) * blk, :], ) return flash_bwd_post -@tilelang.jit(pass_configs={ - tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, -}) +@tilelang.jit( + pass_configs={ + tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, + } +) def flashattn_bwd(batch, heads, seq_len, dim, is_causal, block_M, block_N): - sm_scale = (1.0 / dim)**0.5 - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) + sm_scale = (1.0 / dim) ** 0.5 + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) shape = [batch, heads, seq_len, dim] - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 @T.prim_func def flash_bwd( - Q: T.Tensor(shape, dtype), # type: ignore - K: T.Tensor(shape, dtype), # type: ignore - V: T.Tensor(shape, dtype), # type: ignore - dO: T.Tensor(shape, dtype), # type: ignore - lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore - Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore - dQ: T.Tensor(shape, accum_dtype), # type: ignore - dK: T.Tensor(shape, dtype), # type: ignore - dV: T.Tensor(shape, dtype), # type: ignore + Q: T.Tensor(shape, dtype), # type: ignore + K: T.Tensor(shape, dtype), # type: ignore + V: T.Tensor(shape, dtype), # type: ignore + dO: T.Tensor(shape, dtype), # type: ignore + lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + dQ: T.Tensor(shape, accum_dtype), # type: ignore + dK: T.Tensor(shape, dtype), # type: ignore + dV: T.Tensor(shape, dtype), # type: ignore ): with T.Kernel(heads, T.ceildiv(seq_len, block_M), batch, threads=128) as (bx, by, bz): K_shared = T.alloc_shared([block_M, dim], dtype) @@ -190,36 +197,39 @@ def flashattn_bwd(batch, heads, seq_len, dim, is_causal, block_M, block_N): dv_shared = T.alloc_shared([block_M, dim], dtype) dk_shared = T.alloc_shared([block_M, dim], dtype) - T.annotate_layout({ - dQ: make_dq_layout(dQ), - K_shared: tilelang.layout.make_swizzled_layout(K_shared), - dv_shared: tilelang.layout.make_swizzled_layout(dv_shared), - dk_shared: tilelang.layout.make_swizzled_layout(dk_shared), - }) - T.copy(K[bz, bx, by * block_M:(by + 1) * block_M, :], K_shared) - T.copy(V[bz, bx, by * block_M:(by + 1) * block_M, :], V_shared) + T.annotate_layout( + { + dQ: make_dq_layout(dQ), + K_shared: 
tilelang.layout.make_swizzled_layout(K_shared), + dv_shared: tilelang.layout.make_swizzled_layout(dv_shared), + dk_shared: tilelang.layout.make_swizzled_layout(dk_shared), + } + ) + T.copy(K[bz, bx, by * block_M : (by + 1) * block_M, :], K_shared) + T.copy(V[bz, bx, by * block_M : (by + 1) * block_M, :], V_shared) T.clear(dv) T.clear(dk) loop_st = T.floordiv(by * block_M, block_N) if is_causal else 0 loop_ed = T.ceildiv(seq_len, block_N) for k in T.Pipelined(loop_st, loop_ed, num_stages=2): - T.copy(Q[bz, bx, k * block_N:(k + 1) * block_N, :], q) + T.copy(Q[bz, bx, k * block_N : (k + 1) * block_N, :], q) T.clear(qkT) T.gemm(K_shared, q, qkT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - T.copy(lse[bz, bx, k * block_N:(k + 1) * block_N], lse_shared) + T.copy(lse[bz, bx, k * block_N : (k + 1) * block_N], lse_shared) for i, j in T.Parallel(block_M, block_N): qkT[i, j] = T.exp2(qkT[i, j] * scale - lse_shared[j]) if is_causal: for i, j in T.Parallel(block_M, block_N): - qkT[i, j] = T.if_then_else(by * block_M + i <= k * block_N + j, qkT[i, j], - 0) - T.copy(dO[bz, bx, k * block_N:(k + 1) * block_N, :], do) + qkT[i, j] = T.if_then_else(by * block_M + i <= k * block_N + j, qkT[i, j], 0) + # We don't need to handle OOB positions for non-causal cases, + # since OOB values won't affect other positions here. + T.copy(dO[bz, bx, k * block_N : (k + 1) * block_N, :], do) T.clear(dsT) T.gemm(V_shared, do, dsT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) T.copy(qkT, qkT_cast) T.gemm(qkT_cast, do, dv, policy=T.GemmWarpPolicy.FullRow) - T.copy(Delta[bz, bx, k * block_N:(k + 1) * block_N], delta) + T.copy(Delta[bz, bx, k * block_N : (k + 1) * block_N], delta) for i, j in T.Parallel(block_M, block_N): dsT_cast[i, j] = qkT[i, j] * (dsT[i, j] - delta[j]) * sm_scale @@ -232,14 +242,13 @@ def flashattn_bwd(batch, heads, seq_len, dim, is_causal, block_M, block_N): T.atomic_add(dQ[bz, bx, k * block_N + i, j], dq[i, j]) T.copy(dv, dv_shared) T.copy(dk, dk_shared) - T.copy(dv_shared, dV[bz, bx, by * block_M:(by + 1) * block_M, :]) - T.copy(dk_shared, dK[bz, bx, by * block_M:(by + 1) * block_M, :]) + T.copy(dv_shared, dV[bz, bx, by * block_M : (by + 1) * block_M, :]) + T.copy(dk_shared, dK[bz, bx, by * block_M : (by + 1) * block_M, :]) return flash_bwd class _attention(torch.autograd.Function): - @staticmethod def forward(ctx, q, k, v, causal): BATCH, H, N_CTX, D_HEAD = q.shape @@ -281,15 +290,15 @@ attention = _attention.apply def ref_program(Q, K, V, is_causal): dim = Q.size(-1) - scores = torch.einsum('bhqd,bhkd->bhqk', Q, K) + scores = torch.einsum("bhqd,bhkd->bhqk", Q, K) scores = scores / torch.sqrt(torch.tensor(dim, dtype=scores.dtype)) if is_causal: seq_len = Q.size(2) mask = torch.tril(torch.ones(seq_len, seq_len, device=scores.device)) mask = mask.unsqueeze(0).unsqueeze(0) - scores = scores.masked_fill(mask == 0, float('-inf')) + scores = scores.masked_fill(mask == 0, float("-inf")) attention_weights = F.softmax(scores, dim=-1) - output = torch.einsum('bhqk,bhkd->bhqd', attention_weights, V) + output = torch.einsum("bhqk,bhkd->bhqd", attention_weights, V) return output @@ -304,9 +313,7 @@ def main( total_flops = 5 * flops_per_matmul if causal: total_flops *= 0.5 - Q = ( - torch.empty(BATCH, H, N_CTX, D_HEAD, dtype=torch.half, - device="cuda").normal_().requires_grad_()) + Q = torch.empty(BATCH, H, N_CTX, D_HEAD, dtype=torch.half, device="cuda").normal_().requires_grad_() K = torch.empty_like(Q).normal_().requires_grad_() V = torch.empty_like(Q).normal_().requires_grad_() dO = 
torch.randn_like(Q) @@ -347,10 +354,10 @@ def main( if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=8, help='Batch size') - parser.add_argument('--h', type=int, default=32, help='Number of heads') - parser.add_argument('--n_ctx', type=int, default=1024, help='Context size') - parser.add_argument('--d_head', type=int, default=64, help='Head dimension') - parser.add_argument('--causal', type=bool, default=False, help='Causal flag') + parser.add_argument("--batch", type=int, default=8, help="Batch size") + parser.add_argument("--h", type=int, default=32, help="Number of heads") + parser.add_argument("--n_ctx", type=int, default=1024, help="Context size") + parser.add_argument("--d_head", type=int, default=64, help="Head dimension") + parser.add_argument("--causal", type=bool, default=False, help="Causal flag") args = parser.parse_args() main(args.batch, args.h, args.n_ctx, args.d_head, args.causal) diff --git a/examples/flash_attention/example_mha_bwd.py b/examples/flash_attention/example_mha_bwd_bshd.py similarity index 69% rename from examples/flash_attention/example_mha_bwd.py rename to examples/flash_attention/example_mha_bwd_bshd.py index 543c2c0e7570a1c2d02b689edda90da4aa378937..c0620bde0e95480d907fe94d9909ee3c30348860 100644 --- a/examples/flash_attention/example_mha_bwd.py +++ b/examples/flash_attention/example_mha_bwd_bshd.py @@ -7,22 +7,24 @@ import argparse @tilelang.jit( - out_idx=[3, 4], pass_configs={ + out_idx=[3, 4], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def flashattn_fwd(batch, heads, seq_len, dim, is_causal, block_M, block_N): - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) shape = [batch, seq_len, heads, dim] - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 @T.prim_func def flash_fwd( - Q: T.Tensor(shape, dtype), # type: ignore - K: T.Tensor(shape, dtype), # type: ignore - V: T.Tensor(shape, dtype), # type: ignore - Output: T.Tensor(shape, dtype), # type: ignore - lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + Q: T.Tensor(shape, dtype), # type: ignore + K: T.Tensor(shape, dtype), # type: ignore + V: T.Tensor(shape, dtype), # type: ignore + Output: T.Tensor(shape, dtype), # type: ignore + lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore ): with T.Kernel(T.ceildiv(seq_len, block_M), heads, batch, threads=128) as (bx, by, bz): Q_shared = T.alloc_shared([block_M, dim], dtype) @@ -38,25 +40,25 @@ def flashattn_fwd(batch, heads, seq_len, dim, is_causal, block_M, block_N): scores_sum = T.alloc_fragment([block_M], accum_dtype) logsum = T.alloc_fragment([block_M], accum_dtype) - T.copy(Q[bz, bx * block_M:(bx + 1) * block_M, by, :], Q_shared) + T.copy(Q[bz, bx * block_M : (bx + 1) * block_M, by, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) - loop_range = ( - T.ceildiv( - (bx + 1) * block_M, block_N) if is_causal else T.ceildiv(seq_len, block_N)) + loop_range = T.ceildiv((bx + 1) * block_M, block_N) if is_causal else T.ceildiv(seq_len, block_N) for k in T.Pipelined(loop_range, num_stages=1): - T.copy(K[bz, k * block_N:(k + 1) * block_N, by, :], K_shared) + T.copy(K[bz, k * block_N : (k + 1) * block_N, by, :], K_shared) if is_causal: for i, j in T.Parallel(block_M, block_N): - acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0, - -T.infinity(acc_s.dtype)) + acc_s[i, j] = 
T.if_then_else(bx * block_M + i >= k * block_N + j, 0, -T.infinity(acc_s.dtype)) else: - T.clear(acc_s) + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.if_then_else(k * block_N + j >= seq_len, -T.infinity(acc_s.dtype), 0) T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - T.copy(V[bz, k * block_N:(k + 1) * block_N, by, :], V_shared) + T.copy(V[bz, k * block_N : (k + 1) * block_N, by, :], V_shared) T.copy(scores_max, scores_max_prev) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) for i in T.Parallel(block_M): scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_M, dim): @@ -70,29 +72,31 @@ def flashattn_fwd(batch, heads, seq_len, dim, is_causal, block_M, block_N): logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] for i, j in T.Parallel(block_M, dim): acc_o[i, j] /= logsum[i] - T.copy(acc_o, Output[bz, bx * block_M:(bx + 1) * block_M, by, :]) + T.copy(acc_o, Output[bz, bx * block_M : (bx + 1) * block_M, by, :]) for i in T.Parallel(block_M): logsum[i] = T.log2(logsum[i]) + scores_max[i] * scale - T.copy(logsum, lse[bz, by, bx * block_M:(bx + 1) * block_M]) + T.copy(logsum, lse[bz, by, bx * block_M : (bx + 1) * block_M]) return flash_fwd @tilelang.jit( - out_idx=[2], pass_configs={ + out_idx=[2], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def flashattn_bwd_preprocess(batch, heads, seq_len, dim): - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 shape = [batch, seq_len, heads, dim] blk = 32 @T.prim_func def flash_bwd_prep( - O: T.Tensor(shape, dtype), # type: ignore - dO: T.Tensor(shape, dtype), # type: ignore - Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + O: T.Tensor(shape, dtype), # type: ignore + dO: T.Tensor(shape, dtype), # type: ignore + Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore ): with T.Kernel(heads, T.ceildiv(seq_len, blk), batch) as (bx, by, bz): o = T.alloc_fragment([blk, blk], dtype) @@ -101,68 +105,71 @@ def flashattn_bwd_preprocess(batch, heads, seq_len, dim): delta = T.alloc_fragment([blk], accum_dtype) T.clear(acc) for k in range(T.ceildiv(dim, blk)): - T.copy(O[bz, by * blk:(by + 1) * blk, bx, k * blk:(k + 1) * blk], o) - T.copy(dO[bz, by * blk:(by + 1) * blk, bx, k * blk:(k + 1) * blk], do) + T.copy(O[bz, by * blk : (by + 1) * blk, bx, k * blk : (k + 1) * blk], o) + T.copy(dO[bz, by * blk : (by + 1) * blk, bx, k * blk : (k + 1) * blk], do) for i, j in T.Parallel(blk, blk): acc[i, j] += o[i, j] * do[i, j] T.reduce_sum(acc, delta, 1) - T.copy(delta, Delta[bz, bx, by * blk:(by + 1) * blk]) + T.copy(delta, Delta[bz, bx, by * blk : (by + 1) * blk]) return flash_bwd_prep def make_dq_layout(dQ): # atomicAdd can not be vectorized, so we need to reorder dq to match the 8x8 gemm fragment - return T.Layout(dQ.shape, - lambda b, l, h, d: [b, l // 8, h, d // 8, (d % 2), 4 * (l % 8) + (d % 8) // 2]) + return T.Layout(dQ.shape, lambda b, l, h, d: [b, l // 8, h, d // 8, (d % 2), 4 * (l % 8) + (d % 8) // 2]) @tilelang.jit( - out_idx=[1], pass_configs={ + out_idx=[1], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def flashattn_bwd_postprocess(batch, heads, seq_len, dim): - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 shape = [batch, seq_len, heads, dim] blk = 64 @T.prim_func def 
flash_bwd_post( - dQ: T.Tensor(shape, accum_dtype), # type: ignore - dQ_out: T.Tensor(shape, dtype), # type: ignore + dQ: T.Tensor(shape, accum_dtype), # type: ignore + dQ_out: T.Tensor(shape, dtype), # type: ignore ): with T.Kernel(T.ceildiv(seq_len, blk), heads, batch, threads=128) as (bx, by, bz): T.annotate_layout({dQ: make_dq_layout(dQ)}) T.copy( - dQ[bz, bx * blk:(bx + 1) * blk, by, :], - dQ_out[bz, bx * blk:(bx + 1) * blk, by, :], + dQ[bz, bx * blk : (bx + 1) * blk, by, :], + dQ_out[bz, bx * blk : (bx + 1) * blk, by, :], ) return flash_bwd_post -@tilelang.jit(pass_configs={ - tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, -}) +@tilelang.jit( + pass_configs={ + tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, + } +) def flashattn_bwd(batch, heads, seq_len, dim, is_causal, block_M, block_N): - sm_scale = (1.0 / dim)**0.5 - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) + sm_scale = (1.0 / dim) ** 0.5 + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) shape = [batch, seq_len, heads, dim] - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 @T.prim_func def flash_bwd( - Q: T.Tensor(shape, dtype), # type: ignore - K: T.Tensor(shape, dtype), # type: ignore - V: T.Tensor(shape, dtype), # type: ignore - dO: T.Tensor(shape, dtype), # type: ignore - lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore - Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore - dQ: T.Tensor(shape, accum_dtype), # type: ignore - dK: T.Tensor(shape, dtype), # type: ignore - dV: T.Tensor(shape, dtype), # type: ignore + Q: T.Tensor(shape, dtype), # type: ignore + K: T.Tensor(shape, dtype), # type: ignore + V: T.Tensor(shape, dtype), # type: ignore + dO: T.Tensor(shape, dtype), # type: ignore + lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + dQ: T.Tensor(shape, accum_dtype), # type: ignore + dK: T.Tensor(shape, dtype), # type: ignore + dV: T.Tensor(shape, dtype), # type: ignore ): with T.Kernel(heads, T.ceildiv(seq_len, block_M), batch, threads=128) as (bx, by, bz): K_shared = T.alloc_shared([block_M, dim], dtype) @@ -186,33 +193,36 @@ def flashattn_bwd(batch, heads, seq_len, dim, is_causal, block_M, block_N): dv_shared = T.alloc_shared([block_M, dim], dtype) dk_shared = T.alloc_shared([block_M, dim], dtype) - T.annotate_layout({ - dQ: make_dq_layout(dQ), - }) - T.copy(K[bz, by * block_M:(by + 1) * block_M, bx, :], K_shared) - T.copy(V[bz, by * block_M:(by + 1) * block_M, bx, :], V_shared) + T.annotate_layout( + { + dQ: make_dq_layout(dQ), + } + ) + T.copy(K[bz, by * block_M : (by + 1) * block_M, bx, :], K_shared) + T.copy(V[bz, by * block_M : (by + 1) * block_M, bx, :], V_shared) T.clear(dv) T.clear(dk) loop_st = T.floordiv(by * block_M, block_N) if is_causal else 0 loop_ed = T.ceildiv(seq_len, block_N) for k in T.Pipelined(loop_st, loop_ed, num_stages=2): - T.copy(Q[bz, k * block_N:(k + 1) * block_N, bx, :], q) + T.copy(Q[bz, k * block_N : (k + 1) * block_N, bx, :], q) T.clear(qkT) T.gemm(K_shared, q, qkT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - T.copy(lse[bz, bx, k * block_N:(k + 1) * block_N], lse_shared) + T.copy(lse[bz, bx, k * block_N : (k + 1) * block_N], lse_shared) for i, j in T.Parallel(block_M, block_N): qkT[i, j] = T.exp2(qkT[i, j] * scale - lse_shared[j]) if is_causal: for i, j in T.Parallel(block_M, block_N): - qkT[i, j] = T.if_then_else(by * block_M + i <= k * block_N + j, qkT[i, j], - 0) - T.copy(dO[bz, k * block_N:(k + 1) 
* block_N, bx, :], do) + qkT[i, j] = T.if_then_else(by * block_M + i <= k * block_N + j, qkT[i, j], 0) + # We don't need to handle OOB positions for non-causal cases, + # since OOB values won't affect other positions here. + T.copy(dO[bz, k * block_N : (k + 1) * block_N, bx, :], do) T.clear(dsT) T.gemm(V_shared, do, dsT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) T.copy(qkT, qkT_cast) T.gemm(qkT_cast, do, dv, policy=T.GemmWarpPolicy.FullRow) - T.copy(Delta[bz, bx, k * block_N:(k + 1) * block_N], delta) + T.copy(Delta[bz, bx, k * block_N : (k + 1) * block_N], delta) for i, j in T.Parallel(block_M, block_N): dsT_cast[i, j] = qkT[i, j] * (dsT[i, j] - delta[j]) * sm_scale @@ -225,14 +235,13 @@ def flashattn_bwd(batch, heads, seq_len, dim, is_causal, block_M, block_N): T.atomic_add(dQ[bz, k * block_N + i, bx, j], dq[i, j]) T.copy(dv, dv_shared) T.copy(dk, dk_shared) - T.copy(dv_shared, dV[bz, by * block_M:(by + 1) * block_M, bx, :]) - T.copy(dk_shared, dK[bz, by * block_M:(by + 1) * block_M, bx, :]) + T.copy(dv_shared, dV[bz, by * block_M : (by + 1) * block_M, bx, :]) + T.copy(dk_shared, dK[bz, by * block_M : (by + 1) * block_M, bx, :]) return flash_bwd class _attention(torch.autograd.Function): - @staticmethod def forward(ctx, q, k, v, causal): BATCH, N_CTX, H, D_HEAD = q.shape @@ -274,15 +283,15 @@ attention = _attention.apply def ref_program(Q, K, V, is_causal): dim = Q.size(-1) - scores = torch.einsum('bqhd,bkhd->bhqk', Q, K) + scores = torch.einsum("bqhd,bkhd->bhqk", Q, K) scores = scores / torch.sqrt(torch.tensor(dim, dtype=scores.dtype)) if is_causal: seq_len = Q.size(1) mask = torch.tril(torch.ones(seq_len, seq_len, device=scores.device)) mask = mask.unsqueeze(0).unsqueeze(0) - scores = scores.masked_fill(mask == 0, float('-inf')) + scores = scores.masked_fill(mask == 0, float("-inf")) attention_weights = F.softmax(scores, dim=-1) - output = torch.einsum('bhqk,bkhd->bqhd', attention_weights, V) + output = torch.einsum("bhqk,bkhd->bqhd", attention_weights, V) return output @@ -297,9 +306,7 @@ def main( total_flops = 5 * flops_per_matmul if causal: total_flops *= 0.5 - Q = ( - torch.empty(BATCH, N_CTX, H, D_HEAD, dtype=torch.half, - device="cuda").normal_().requires_grad_()) + Q = torch.empty(BATCH, N_CTX, H, D_HEAD, dtype=torch.half, device="cuda").normal_().requires_grad_() K = torch.empty_like(Q).normal_().requires_grad_() V = torch.empty_like(Q).normal_().requires_grad_() dO = torch.randn_like(Q) @@ -338,10 +345,10 @@ def main( if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=8, help='Batch size') - parser.add_argument('--h', type=int, default=32, help='Number of heads') - parser.add_argument('--n_ctx', type=int, default=1024, help='Context size') - parser.add_argument('--d_head', type=int, default=64, help='Head dimension') - parser.add_argument('--causal', type=bool, default=False, help='Causal flag') + parser.add_argument("--batch", type=int, default=8, help="Batch size") + parser.add_argument("--h", type=int, default=32, help="Number of heads") + parser.add_argument("--n_ctx", type=int, default=1024, help="Context size") + parser.add_argument("--d_head", type=int, default=64, help="Head dimension") + parser.add_argument("--causal", type=bool, default=False, help="Causal flag") args = parser.parse_args() main(args.batch, args.h, args.n_ctx, args.d_head, args.causal) diff --git a/examples/flash_attention/example_mha_bwd_wgmma_pipelined.py b/examples/flash_attention/example_mha_bwd_bshd_wgmma_pipelined.py 
similarity index 67% rename from examples/flash_attention/example_mha_bwd_wgmma_pipelined.py rename to examples/flash_attention/example_mha_bwd_bshd_wgmma_pipelined.py index 7ad417ef55782855087d7909db29a8f0dfffafeb..34a8d69ce475f7126e2635ae91f9a71159aa0d91 100644 --- a/examples/flash_attention/example_mha_bwd_wgmma_pipelined.py +++ b/examples/flash_attention/example_mha_bwd_bshd_wgmma_pipelined.py @@ -7,22 +7,24 @@ import argparse @tilelang.jit( - out_idx=[3, 4], pass_configs={ + out_idx=[3, 4], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def flashattn_fwd(batch, heads, seq_len, dim, is_causal, block_M, block_N): - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) shape = [batch, seq_len, heads, dim] - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 @T.prim_func def flash_fwd( - Q: T.Tensor(shape, dtype), # type: ignore - K: T.Tensor(shape, dtype), # type: ignore - V: T.Tensor(shape, dtype), # type: ignore - Output: T.Tensor(shape, dtype), # type: ignore - lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + Q: T.Tensor(shape, dtype), # type: ignore + K: T.Tensor(shape, dtype), # type: ignore + V: T.Tensor(shape, dtype), # type: ignore + Output: T.Tensor(shape, dtype), # type: ignore + lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore ): with T.Kernel(T.ceildiv(seq_len, block_M), heads, batch, threads=128) as (bx, by, bz): Q_shared = T.alloc_shared([block_M, dim], dtype) @@ -38,26 +40,26 @@ def flashattn_fwd(batch, heads, seq_len, dim, is_causal, block_M, block_N): logsum = T.alloc_fragment([block_M], accum_dtype) T.annotate_layout({Q_shared: tilelang.layout.make_swizzled_layout(Q_shared)}) - T.copy(Q[bz, bx * block_M:(bx + 1) * block_M, by, :], Q_shared) + T.copy(Q[bz, bx * block_M : (bx + 1) * block_M, by, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) - loop_range = ( - T.ceildiv( - (bx + 1) * block_M, block_N) if is_causal else T.ceildiv(seq_len, block_N)) + loop_range = T.ceildiv((bx + 1) * block_M, block_N) if is_causal else T.ceildiv(seq_len, block_N) for k in T.Pipelined(loop_range, num_stages=1): - T.copy(K[bz, k * block_N:(k + 1) * block_N, by, :], K_shared) + T.copy(K[bz, k * block_N : (k + 1) * block_N, by, :], K_shared) if is_causal: for i, j in T.Parallel(block_M, block_N): - acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0, - -T.infinity(acc_s.dtype)) + acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0, -T.infinity(acc_s.dtype)) else: - T.clear(acc_s) + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.if_then_else(k * block_N + j >= seq_len, -T.infinity(acc_s.dtype), 0) T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - T.copy(V[bz, k * block_N:(k + 1) * block_N, by, :], V_shared) + T.copy(V[bz, k * block_N : (k + 1) * block_N, by, :], V_shared) T.copy(scores_max, scores_max_prev) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) for i in T.Parallel(block_M): scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_M, dim): @@ -71,29 +73,31 @@ def flashattn_fwd(batch, heads, seq_len, dim, is_causal, block_M, block_N): logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] for i, j in T.Parallel(block_M, dim): acc_o[i, j] /= logsum[i] - 
T.copy(acc_o, Output[bz, bx * block_M:(bx + 1) * block_M, by, :]) + T.copy(acc_o, Output[bz, bx * block_M : (bx + 1) * block_M, by, :]) for i in T.Parallel(block_M): logsum[i] = T.log2(logsum[i]) + scores_max[i] * scale - T.copy(logsum, lse[bz, by, bx * block_M:(bx + 1) * block_M]) + T.copy(logsum, lse[bz, by, bx * block_M : (bx + 1) * block_M]) return flash_fwd @tilelang.jit( - out_idx=[2], pass_configs={ + out_idx=[2], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def flashattn_bwd_preprocess(batch, heads, seq_len, dim): - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 shape = [batch, seq_len, heads, dim] blk = 32 @T.prim_func def flash_bwd_prep( - O: T.Tensor(shape, dtype), # type: ignore - dO: T.Tensor(shape, dtype), # type: ignore - Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + O: T.Tensor(shape, dtype), # type: ignore + dO: T.Tensor(shape, dtype), # type: ignore + Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore ): with T.Kernel(heads, T.ceildiv(seq_len, blk), batch) as (bx, by, bz): o = T.alloc_fragment([blk, blk], dtype) @@ -102,37 +106,39 @@ def flashattn_bwd_preprocess(batch, heads, seq_len, dim): delta = T.alloc_fragment([blk], accum_dtype) T.clear(acc) for k in range(T.ceildiv(dim, blk)): - T.copy(O[bz, by * blk:(by + 1) * blk, bx, k * blk:(k + 1) * blk], o) - T.copy(dO[bz, by * blk:(by + 1) * blk, bx, k * blk:(k + 1) * blk], do) + T.copy(O[bz, by * blk : (by + 1) * blk, bx, k * blk : (k + 1) * blk], o) + T.copy(dO[bz, by * blk : (by + 1) * blk, bx, k * blk : (k + 1) * blk], do) for i, j in T.Parallel(blk, blk): acc[i, j] += o[i, j] * do[i, j] T.reduce_sum(acc, delta, 1) - T.copy(delta, Delta[bz, bx, by * blk:(by + 1) * blk]) + T.copy(delta, Delta[bz, bx, by * blk : (by + 1) * blk]) return flash_bwd_prep -@tilelang.jit(pass_configs={ - tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, -}) +@tilelang.jit( + pass_configs={ + tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, + } +) def flashattn_bwd(batch, heads, seq_len, dim, is_causal, block_M, block_N): - sm_scale = (1.0 / dim)**0.5 - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) + sm_scale = (1.0 / dim) ** 0.5 + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) shape = [batch, seq_len, heads, dim] - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 @T.prim_func def flash_bwd( - Q: T.Tensor(shape, dtype), # type: ignore - K: T.Tensor(shape, dtype), # type: ignore - V: T.Tensor(shape, dtype), # type: ignore - dO: T.Tensor(shape, dtype), # type: ignore - lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore - Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore - dQ: T.Tensor(shape, accum_dtype), # type: ignore - dK: T.Tensor(shape, dtype), # type: ignore - dV: T.Tensor(shape, dtype), # type: ignore + Q: T.Tensor(shape, dtype), # type: ignore + K: T.Tensor(shape, dtype), # type: ignore + V: T.Tensor(shape, dtype), # type: ignore + dO: T.Tensor(shape, dtype), # type: ignore + lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + dQ: T.Tensor(shape, accum_dtype), # type: ignore + dK: T.Tensor(shape, dtype), # type: ignore + dV: T.Tensor(shape, dtype), # type: ignore ): with T.Kernel(heads, T.ceildiv(seq_len, block_M), batch, threads=256) as (bx, by, bz): K_shared = T.alloc_shared([block_M, dim], dtype) @@ -157,47 +163,43 @@ def flashattn_bwd(batch, heads, 
seq_len, dim, is_causal, block_M, block_N): dk_shared = T.alloc_shared([block_M, dim], dtype) dq_shared = T.alloc_shared([block_N, dim], accum_dtype) - T.annotate_layout({ - K_shared: tilelang.layout.make_swizzled_layout(K_shared), - dv_shared: tilelang.layout.make_swizzled_layout(dv_shared), - dk_shared: tilelang.layout.make_swizzled_layout(dk_shared), - dq_shared: tilelang.layout.make_swizzled_layout(dq_shared), - }) - - T.copy(K[bz, by * block_M:(by + 1) * block_M, bx, :], K_shared) - T.copy(V[bz, by * block_M:(by + 1) * block_M, bx, :], V_shared) + T.annotate_layout( + { + K_shared: tilelang.layout.make_swizzled_layout(K_shared), + dv_shared: tilelang.layout.make_swizzled_layout(dv_shared), + dk_shared: tilelang.layout.make_swizzled_layout(dk_shared), + dq_shared: tilelang.layout.make_swizzled_layout(dq_shared), + } + ) + + T.copy(K[bz, by * block_M : (by + 1) * block_M, bx, :], K_shared) + T.copy(V[bz, by * block_M : (by + 1) * block_M, bx, :], V_shared) T.clear(dv) T.clear(dk) loop_st = T.floordiv(by * block_M, block_N) if is_causal else 0 loop_ed = T.ceildiv(seq_len, block_N) for k in T.Pipelined(loop_st, loop_ed, num_stages=2): - T.copy(Q[bz, k * block_N:(k + 1) * block_N, bx, :], q) + T.copy(Q[bz, k * block_N : (k + 1) * block_N, bx, :], q) T.clear(qkT) - T.gemm( - K_shared, q, qkT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow, wg_wait=-1) - T.copy(dO[bz, k * block_N:(k + 1) * block_N, bx, :], do) + T.gemm(K_shared, q, qkT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow, wg_wait=-1) + T.copy(dO[bz, k * block_N : (k + 1) * block_N, bx, :], do) T.clear(dsT) - T.gemm( - V_shared, - do, - dsT, - transpose_B=True, - policy=T.GemmWarpPolicy.FullRow, - wg_wait=-1) + T.gemm(V_shared, do, dsT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow, wg_wait=-1) T.wait_wgmma(1) - T.copy(lse[bz, bx, k * block_N:(k + 1) * block_N], lse_shared) + T.copy(lse[bz, bx, k * block_N : (k + 1) * block_N], lse_shared) for i, j in T.Parallel(block_M, block_N): qkT[i, j] = T.exp2(qkT[i, j] * scale - lse_shared[j]) if is_causal: for i, j in T.Parallel(block_M, block_N): - qkT[i, j] = T.if_then_else(by * block_M + i <= k * block_N + j, qkT[i, j], - 0) + qkT[i, j] = T.if_then_else(by * block_M + i <= k * block_N + j, qkT[i, j], 0) + # We don't need to handle OOB positions for non-causal cases, + # since OOB values won't affect other positions here. 
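The dsT_cast update a few lines below applies the softmax-backward identity dS = P * (dP - Delta), where Delta = rowsum(O * dO) is exactly what the flash_bwd_prep kernel precomputes. A minimal PyTorch sketch that checks this identity against autograd (variable names here are illustrative, and the sm_scale factor is left out):

import torch

torch.manual_seed(0)
S = torch.randn(16, 16, dtype=torch.float64, requires_grad=True)
V = torch.randn(16, 8, dtype=torch.float64)
dO = torch.randn(16, 8, dtype=torch.float64)

P = torch.softmax(S, dim=-1)
O = P @ V
O.backward(dO)                              # autograd reference gradient w.r.t. the scores S

dP = dO @ V.T                               # gradient w.r.t. the probabilities P
Delta = (O * dO).sum(dim=-1, keepdim=True)  # rowsum(O * dO), as in flash_bwd_prep
dS = P * (dP - Delta)                       # the identity behind dsT_cast (sm_scale omitted)
torch.testing.assert_close(dS, S.grad)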
T.wait_wgmma(0) T.copy(qkT, qkT_cast) T.gemm(qkT_cast, do, dv, policy=T.GemmWarpPolicy.FullRow, wg_wait=-1) - T.copy(Delta[bz, bx, k * block_N:(k + 1) * block_N], delta) + T.copy(Delta[bz, bx, k * block_N : (k + 1) * block_N], delta) for i, j in T.Parallel(block_M, block_N): dsT_cast[i, j] = qkT[i, j] * (dsT[i, j] - delta[j]) * sm_scale @@ -208,17 +210,16 @@ def flashattn_bwd(batch, heads, seq_len, dim, is_causal, block_M, block_N): T.gemm(dsT_shared, K_shared, dq, transpose_A=True, wg_wait=1) T.wait_wgmma(0) T.copy(dq, dq_shared) - T.atomic_add(dQ[bz, k * block_N:(k + 1) * block_N, bx, :], dq_shared) + T.atomic_add(dQ[bz, k * block_N : (k + 1) * block_N, bx, :], dq_shared) T.copy(dv, dv_shared) T.copy(dk, dk_shared) - T.copy(dv_shared, dV[bz, by * block_M:(by + 1) * block_M, bx, :]) - T.copy(dk_shared, dK[bz, by * block_M:(by + 1) * block_M, bx, :]) + T.copy(dv_shared, dV[bz, by * block_M : (by + 1) * block_M, bx, :]) + T.copy(dk_shared, dK[bz, by * block_M : (by + 1) * block_M, bx, :]) return flash_bwd class _attention(torch.autograd.Function): - @staticmethod def forward(ctx, q, k, v, causal): BATCH, N_CTX, H, D_HEAD = q.shape @@ -260,15 +261,15 @@ attention = _attention.apply def ref_program(Q, K, V, is_causal): dim = Q.size(-1) - scores = torch.einsum('bqhd,bkhd->bhqk', Q, K) + scores = torch.einsum("bqhd,bkhd->bhqk", Q, K) scores = scores / torch.sqrt(torch.tensor(dim, dtype=scores.dtype)) if is_causal: seq_len = Q.size(1) mask = torch.tril(torch.ones(seq_len, seq_len, device=scores.device)) mask = mask.unsqueeze(0).unsqueeze(0) - scores = scores.masked_fill(mask == 0, float('-inf')) + scores = scores.masked_fill(mask == 0, float("-inf")) attention_weights = F.softmax(scores, dim=-1) - output = torch.einsum('bhqk,bkhd->bqhd', attention_weights, V) + output = torch.einsum("bhqk,bkhd->bqhd", attention_weights, V) return output @@ -283,9 +284,7 @@ def main( total_flops = 5 * flops_per_matmul if causal: total_flops *= 0.5 - Q = ( - torch.empty(BATCH, N_CTX, H, D_HEAD, dtype=torch.half, - device="cuda").normal_().requires_grad_()) + Q = torch.empty(BATCH, N_CTX, H, D_HEAD, dtype=torch.half, device="cuda").normal_().requires_grad_() K = torch.empty_like(Q).normal_().requires_grad_() V = torch.empty_like(Q).normal_().requires_grad_() dO = torch.randn_like(Q) @@ -305,7 +304,7 @@ def main( assert torch.allclose(dV, dV_ref, rtol=1e-2, atol=1e-2) assert torch.allclose(dK, dK_ref, rtol=1e-2, atol=1e-2) assert torch.allclose(dQ, dQ_ref, rtol=1e-2, atol=1e-2) - print('All checks passed.✅') + print("All checks passed.✅") def run(): O_ref.backward(dO, retain_graph=True) @@ -323,10 +322,10 @@ def main( if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=8, help='Batch size') - parser.add_argument('--h', type=int, default=32, help='Number of heads') - parser.add_argument('--n_ctx', type=int, default=1024, help='Context size') - parser.add_argument('--d_head', type=int, default=64, help='Head dimension') - parser.add_argument('--causal', type=bool, default=False, help='Causal flag') + parser.add_argument("--batch", type=int, default=8, help="Batch size") + parser.add_argument("--h", type=int, default=32, help="Number of heads") + parser.add_argument("--n_ctx", type=int, default=1024, help="Context size") + parser.add_argument("--d_head", type=int, default=64, help="Head dimension") + parser.add_argument("--causal", type=bool, default=False, help="Causal flag") args = parser.parse_args() main(args.batch, args.h, args.n_ctx, args.d_head, 
args.causal) diff --git a/examples/flash_attention/example_mha_fwd_bhsd.py b/examples/flash_attention/example_mha_fwd_bhsd.py index f07f7a618cef78dcb586934500f67d29cb1bf292..e70d17bf8c9adc9b12d90111e3fe8906cccc5ba0 100644 --- a/examples/flash_attention/example_mha_fwd_bhsd.py +++ b/examples/flash_attention/example_mha_fwd_bhsd.py @@ -15,24 +15,17 @@ def get_configs(): @autotune(configs=get_configs(), warmup=10, rep=10) @tilelang.jit( - out_idx=[3], pass_configs={ + out_idx=[3], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) -def flashattn(batch, - heads, - seq_q, - seq_kv, - dim, - is_causal, - block_M=64, - block_N=64, - num_stages=1, - threads=128): - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) + }, +) +def flashattn(batch, heads, seq_q, seq_kv, dim, is_causal, block_M=64, block_N=64, num_stages=1, threads=128): + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) q_shape = [batch, heads, seq_q, dim] kv_shape = [batch, heads, seq_kv, dim] - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 past_len = seq_kv - seq_q assert past_len >= 0, "seq_kv must be greater than or equal to seq_q" @@ -48,14 +41,16 @@ def flashattn(batch, by: T.int32, bz: T.int32, ): - T.copy(K[bz, by, k * block_N:(k + 1) * block_N, :], K_shared) + T.copy(K[bz, by, k * block_N : (k + 1) * block_N, :], K_shared) if is_causal: for i, j in T.Parallel(block_M, block_N): q_idx = bx * block_M + i + past_len k_idx = k * block_N + j acc_s[i, j] = T.if_then_else(q_idx >= k_idx, 0, -T.infinity(acc_s.dtype)) else: - T.clear(acc_s) + # We shall fill -inf for OOB positions + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.if_then_else(k * block_N + j >= seq_kv, -T.infinity(acc_s.dtype), 0) T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) @T.macro @@ -68,22 +63,26 @@ def flashattn(batch, by: T.int32, bz: T.int32, ): - T.copy(V[bz, by, k * block_N:(k + 1) * block_N, :], V_shared) + T.copy(V[bz, by, k * block_N : (k + 1) * block_N, :], V_shared) T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) @T.macro def Softmax( - acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype), - acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype), - scores_max: T.FragmentBuffer([block_M], accum_dtype), - scores_max_prev: T.FragmentBuffer([block_M], accum_dtype), - scores_scale: T.FragmentBuffer([block_M], accum_dtype), - scores_sum: T.FragmentBuffer([block_M], accum_dtype), - logsum: T.FragmentBuffer([block_M], accum_dtype), + acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype), + acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype), + scores_max: T.FragmentBuffer([block_M], accum_dtype), + scores_max_prev: T.FragmentBuffer([block_M], accum_dtype), + scores_scale: T.FragmentBuffer([block_M], accum_dtype), + scores_sum: T.FragmentBuffer([block_M], accum_dtype), + logsum: T.FragmentBuffer([block_M], accum_dtype), ): T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) + # To do causal softmax, we need to set the scores_max to 0 if it is -inf # This process is called Check_inf in FlashAttention3 code, and it only need to be done # in the first ceil_div(kBlockM, kBlockN) steps. 
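The Softmax macro above is the standard online-softmax recurrence: fold the new block's row maxima into the running maxima, rescale the previous accumulator and denominator by exp(m_old - m_new), then add the new block's contribution. A minimal NumPy sketch of one step and of the final normalization (names are illustrative; the kernel itself works in base 2 with the log2(e) factor folded into scale):

import numpy as np

def online_softmax_step(s_block, v_block, m, l, o):
    """Fold one block of scores and values into the running (max, denominator, output) state."""
    m_new = np.maximum(m, s_block.max(axis=-1))   # running row max never decreases
    alpha = np.exp(m - m_new)                     # rescale factor for the old state (scores_scale)
    p = np.exp(s_block - m_new[:, None])          # probabilities relative to the new max
    l_new = l * alpha + p.sum(axis=-1)            # running denominator (logsum)
    o_new = o * alpha[:, None] + p @ v_block      # running unnormalized output (acc_o)
    return m_new, l_new, o_new

# Quick check against a one-shot softmax over two key blocks of width 8:
rng = np.random.default_rng(0)
scores = rng.standard_normal((4, 16))
values = rng.standard_normal((16, 5))
m = np.full(4, -np.inf)
l = np.zeros(4)
o = np.zeros((4, 5))
for blk in range(2):
    m, l, o = online_softmax_step(scores[:, blk * 8:(blk + 1) * 8], values[blk * 8:(blk + 1) * 8], m, l, o)
p_ref = np.exp(scores - scores.max(axis=-1, keepdims=True))
p_ref /= p_ref.sum(axis=-1, keepdims=True)
assert np.allclose(o / l[:, None], p_ref @ values)   # matches the final acc_o /= logsum step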
@@ -104,18 +103,18 @@ def flashattn(batch, @T.macro def Rescale( - acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), - scores_scale: T.FragmentBuffer([block_M], accum_dtype), + acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), + scores_scale: T.FragmentBuffer([block_M], accum_dtype), ): for i, j in T.Parallel(block_M, dim): acc_o[i, j] *= scores_scale[i] @T.prim_func def main( - Q: T.Tensor(q_shape, dtype), - K: T.Tensor(kv_shape, dtype), - V: T.Tensor(kv_shape, dtype), - Output: T.Tensor(q_shape, dtype), + Q: T.Tensor(q_shape, dtype), + K: T.Tensor(kv_shape, dtype), + V: T.Tensor(kv_shape, dtype), + Output: T.Tensor(q_shape, dtype), ): with T.Kernel(T.ceildiv(seq_q, block_M), heads, batch, threads=threads) as (bx, by, bz): Q_shared = T.alloc_shared([block_M, dim], dtype) @@ -131,43 +130,42 @@ def flashattn(batch, scores_sum = T.alloc_fragment([block_M], accum_dtype) logsum = T.alloc_fragment([block_M], accum_dtype) - T.copy(Q[bz, by, bx * block_M:(bx + 1) * block_M, :], Q_shared) + T.copy(Q[bz, by, bx * block_M : (bx + 1) * block_M, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) loop_range = ( - T.min( - T.ceildiv(seq_kv, block_N), T.ceildiv( - (bx + 1) * block_M + - past_len, block_N)) if is_causal else T.ceildiv(seq_kv, block_N)) + T.min(T.ceildiv(seq_kv, block_N), T.ceildiv((bx + 1) * block_M + past_len, block_N)) + if is_causal + else T.ceildiv(seq_kv, block_N) + ) for k in T.Pipelined(loop_range, num_stages=num_stages): MMA0(K, Q_shared, K_shared, acc_s, k, bx, by, bz) - Softmax(acc_s, acc_s_cast, scores_max, scores_max_prev, scores_scale, scores_sum, - logsum) + Softmax(acc_s, acc_s_cast, scores_max, scores_max_prev, scores_scale, scores_sum, logsum) Rescale(acc_o, scores_scale) MMA1(V, V_shared, acc_s_cast, acc_o, k, by, bz) for i, j in T.Parallel(block_M, dim): acc_o[i, j] /= logsum[i] T.copy(acc_o, O_shared) - T.copy(O_shared, Output[bz, by, bx * block_M:(bx + 1) * block_M, :]) + T.copy(O_shared, Output[bz, by, bx * block_M : (bx + 1) * block_M, :]) return main def ref_program(Q, K, V, is_causal): dim = Q.size(-1) - scores = torch.einsum('bhqd,bhkd->bhqk', Q, K) + scores = torch.einsum("bhqd,bhkd->bhqk", Q, K) scores = scores / torch.sqrt(torch.tensor(dim, dtype=scores.dtype)) if is_causal: seq_q = Q.size(2) seq_kv = K.size(2) mask = torch.tril(torch.ones(seq_q, seq_kv, device=scores.device), seq_kv - seq_q) mask = mask.unsqueeze(0).unsqueeze(0) - scores = scores.masked_fill(mask == 0, float('-inf')) + scores = scores.masked_fill(mask == 0, float("-inf")) attention_weights = F.softmax(scores, dim=-1) - output = torch.einsum('bhqk,bhkd->bhqd', attention_weights, V) + output = torch.einsum("bhqk,bhkd->bhqd", attention_weights, V) return output @@ -185,18 +183,8 @@ def main( if is_causal: total_flops *= 0.5 - if (not tune): - kernel = flashattn( - batch, - heads, - seq_q, - seq_kv, - dim, - is_causal, - block_M=64, - block_N=64, - num_stages=1, - threads=128) + if not tune: + kernel = flashattn(batch, heads, seq_q, seq_kv, dim, is_causal, block_M=64, block_N=64, num_stages=1, threads=128) ref_program_processed = partial(ref_program, is_causal=is_causal) profiler = kernel.get_profiler() @@ -221,12 +209,12 @@ def main( if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=1, help='batch size') - parser.add_argument('--heads', type=int, default=1, help='heads') - parser.add_argument('--seq_q', type=int, default=256, help='query sequence length') - 
parser.add_argument('--seq_kv', type=int, default=256, help='key/value sequence length') - parser.add_argument('--dim', type=int, default=64, help='dim') - parser.add_argument('--is_causal', action='store_true', help='causal') - parser.add_argument('--tune', action='store_true', help='tune configs') + parser.add_argument("--batch", type=int, default=1, help="batch size") + parser.add_argument("--heads", type=int, default=1, help="heads") + parser.add_argument("--seq_q", type=int, default=256, help="query sequence length") + parser.add_argument("--seq_kv", type=int, default=256, help="key/value sequence length") + parser.add_argument("--dim", type=int, default=64, help="dim") + parser.add_argument("--is_causal", action="store_true", help="causal", default=False) + parser.add_argument("--tune", action="store_true", help="tune configs") args = parser.parse_args() main(args.batch, args.heads, args.seq_q, args.seq_kv, args.dim, args.is_causal, args.tune) diff --git a/examples/flash_attention/example_mha_fwd_bhsd_wgmma_pipelined.py b/examples/flash_attention/example_mha_fwd_bhsd_wgmma_pipelined.py index 26167b34b7d596e10b958ac0d50293f0224a8c79..b8c4d81ece8607c4499798d58421c067dc60b518 100644 --- a/examples/flash_attention/example_mha_fwd_bhsd_wgmma_pipelined.py +++ b/examples/flash_attention/example_mha_fwd_bhsd_wgmma_pipelined.py @@ -15,24 +15,17 @@ def get_configs(): @autotune(configs=get_configs(), warmup=10, rep=10) @tilelang.jit( - out_idx=[3], pass_configs={ + out_idx=[3], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) -def flashattn(batch, - heads, - seq_q, - seq_kv, - dim, - is_causal, - block_M=128, - block_N=128, - num_stages=2, - threads=256): - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) + }, +) +def flashattn(batch, heads, seq_q, seq_kv, dim, is_causal, block_M=128, block_N=128, num_stages=2, threads=256): + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) q_shape = [batch, heads, seq_q, dim] kv_shape = [batch, heads, seq_kv, dim] - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 past_len = seq_kv - seq_q assert past_len >= 0, "seq_kv must be greater than or equal to seq_q" @@ -48,14 +41,16 @@ def flashattn(batch, by: T.int32, bz: T.int32, ): - T.copy(K[bz, by, k * block_N:(k + 1) * block_N, :], K_shared) + T.copy(K[bz, by, k * block_N : (k + 1) * block_N, :], K_shared) if is_causal: for i, j in T.Parallel(block_M, block_N): q_idx = bx * block_M + i + past_len k_idx = k * block_N + j acc_s[i, j] = T.if_then_else(q_idx >= k_idx, 0, -T.infinity(acc_s.dtype)) else: - T.clear(acc_s) + # We shall fill -inf for OOB positions + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.if_then_else(k * block_N + j >= seq_kv, -T.infinity(acc_s.dtype), 0) T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) @T.macro @@ -68,22 +63,24 @@ def flashattn(batch, by: T.int32, bz: T.int32, ): - T.copy(V[bz, by, k * block_N:(k + 1) * block_N, :], V_shared) + T.copy(V[bz, by, k * block_N : (k + 1) * block_N, :], V_shared) T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) @T.macro def Softmax( - acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype), - acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype), - scores_max: T.FragmentBuffer([block_M], accum_dtype), - scores_max_prev: T.FragmentBuffer([block_M], accum_dtype), - scores_scale: T.FragmentBuffer([block_M], accum_dtype), - scores_sum: T.FragmentBuffer([block_M], accum_dtype), - logsum: T.FragmentBuffer([block_M], 
accum_dtype), + acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype), + acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype), + scores_max: T.FragmentBuffer([block_M], accum_dtype), + scores_max_prev: T.FragmentBuffer([block_M], accum_dtype), + scores_scale: T.FragmentBuffer([block_M], accum_dtype), + scores_sum: T.FragmentBuffer([block_M], accum_dtype), + logsum: T.FragmentBuffer([block_M], accum_dtype), ): T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) # To do causal softmax, we need to set the scores_max to 0 if it is -inf # This process is called Check_inf in FlashAttention3 code, and it only need to be done # in the first ceil_div(kBlockM, kBlockN) steps. @@ -104,18 +101,18 @@ def flashattn(batch, @T.macro def Rescale( - acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), - scores_scale: T.FragmentBuffer([block_M], accum_dtype), + acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), + scores_scale: T.FragmentBuffer([block_M], accum_dtype), ): for i, j in T.Parallel(block_M, dim): acc_o[i, j] *= scores_scale[i] @T.prim_func def main( - Q: T.Tensor(q_shape, dtype), - K: T.Tensor(kv_shape, dtype), - V: T.Tensor(kv_shape, dtype), - Output: T.Tensor(q_shape, dtype), + Q: T.Tensor(q_shape, dtype), + K: T.Tensor(kv_shape, dtype), + V: T.Tensor(kv_shape, dtype), + Output: T.Tensor(q_shape, dtype), ): with T.Kernel(T.ceildiv(seq_q, block_M), heads, batch, threads=threads) as (bx, by, bz): Q_shared = T.alloc_shared([block_M, dim], dtype) @@ -131,48 +128,48 @@ def flashattn(batch, scores_sum = T.alloc_fragment([block_M], accum_dtype) logsum = T.alloc_fragment([block_M], accum_dtype) - T.copy(Q[bz, by, bx * block_M:(bx + 1) * block_M, :], Q_shared) + T.copy(Q[bz, by, bx * block_M : (bx + 1) * block_M, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) loop_range = ( - T.min( - T.ceildiv(seq_kv, block_N), T.ceildiv( - (bx + 1) * block_M + - past_len, block_N)) if is_causal else T.ceildiv(seq_kv, block_N)) + T.min(T.ceildiv(seq_kv, block_N), T.ceildiv((bx + 1) * block_M + past_len, block_N)) + if is_causal + else T.ceildiv(seq_kv, block_N) + ) for k in T.Pipelined( - loop_range, - num_stages=num_stages, - order=[-1, 0, 3, 1, -1, 2], - stage=[-1, 0, 0, 1, -1, 1], - group=[[0], [1, 2], [3, 4, 5, 6, 7, 8, 9, 10], [11], [12], [13]]): + loop_range, + num_stages=num_stages, + order=[-1, 0, 3, 1, -1, 2], + stage=[-1, 0, 0, 1, -1, 1], + group=[[0], [1, 2], [3, 4, 5, 6, 7, 8, 9, 10, 11], [12], [13], [14]], + ): MMA0(K, Q_shared, K_shared, acc_s, k, bx, by, bz) - Softmax(acc_s, acc_s_cast, scores_max, scores_max_prev, scores_scale, scores_sum, - logsum) + Softmax(acc_s, acc_s_cast, scores_max, scores_max_prev, scores_scale, scores_sum, logsum) Rescale(acc_o, scores_scale) MMA1(V, V_shared, acc_s_cast, acc_o, k, by, bz) for i, j in T.Parallel(block_M, dim): acc_o[i, j] /= logsum[i] T.copy(acc_o, O_shared) - T.copy(O_shared, Output[bz, by, bx * block_M:(bx + 1) * block_M, :]) + T.copy(O_shared, Output[bz, by, bx * block_M : (bx + 1) * block_M, :]) return main def ref_program(Q, K, V, is_causal): dim = Q.size(-1) - scores = torch.einsum('bhqd,bhkd->bhqk', Q, K) + scores = torch.einsum("bhqd,bhkd->bhqk", Q, K) scores = scores / torch.sqrt(torch.tensor(dim, dtype=scores.dtype)) if is_causal: seq_q = Q.size(2) seq_kv = K.size(2) mask = torch.tril(torch.ones(seq_q, seq_kv, 
device=scores.device), seq_kv - seq_q) mask = mask.unsqueeze(0).unsqueeze(0) - scores = scores.masked_fill(mask == 0, float('-inf')) + scores = scores.masked_fill(mask == 0, float("-inf")) attention_weights = F.softmax(scores, dim=-1) - output = torch.einsum('bhqk,bhkd->bhqd', attention_weights, V) + output = torch.einsum("bhqk,bhkd->bhqd", attention_weights, V) return output @@ -190,18 +187,8 @@ def main( if is_causal: total_flops *= 0.5 - if (not tune): - kernel = flashattn( - batch, - heads, - seq_q, - seq_kv, - dim, - is_causal, - block_M=128, - block_N=128, - num_stages=2, - threads=256) + if not tune: + kernel = flashattn(batch, heads, seq_q, seq_kv, dim, is_causal, block_M=128, block_N=128, num_stages=2, threads=256) ref_program_processed = partial(ref_program, is_causal=is_causal) profiler = kernel.get_profiler() @@ -226,12 +213,12 @@ def main( if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=8, help='batch size') - parser.add_argument('--heads', type=int, default=32, help='heads') - parser.add_argument('--seq_q', type=int, default=4096, help='query sequence length') - parser.add_argument('--seq_kv', type=int, default=4096, help='key/value sequence length') - parser.add_argument('--dim', type=int, default=128, help='dim') - parser.add_argument('--is_causal', action='store_true', help='causal') - parser.add_argument('--tune', action='store_true', help='tune configs') + parser.add_argument("--batch", type=int, default=8, help="batch size") + parser.add_argument("--heads", type=int, default=32, help="heads") + parser.add_argument("--seq_q", type=int, default=4096, help="query sequence length") + parser.add_argument("--seq_kv", type=int, default=4096, help="key/value sequence length") + parser.add_argument("--dim", type=int, default=128, help="dim") + parser.add_argument("--is_causal", action="store_true", help="causal") + parser.add_argument("--tune", action="store_true", help="tune configs") args = parser.parse_args() main(args.batch, args.heads, args.seq_q, args.seq_kv, args.dim, args.is_causal, args.tune) diff --git a/examples/flash_attention/example_mha_fwd_bshd.py b/examples/flash_attention/example_mha_fwd_bshd.py index 6a1f707e57148f6cadda63bdf2857b94b8a7d00e..248073f797d7b41d6b223f6de25c02f5b486de5b 100644 --- a/examples/flash_attention/example_mha_fwd_bshd.py +++ b/examples/flash_attention/example_mha_fwd_bshd.py @@ -15,22 +15,16 @@ def get_configs(): @autotune(configs=get_configs(), warmup=10, rep=10) @tilelang.jit( - out_idx=[3], pass_configs={ + out_idx=[3], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) -def flashattn(batch, - heads, - seq_len, - dim, - is_causal, - block_M=64, - block_N=64, - num_stages=1, - threads=128): - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) + }, +) +def flashattn(batch, heads, seq_len, dim, is_causal, block_M=64, block_N=64, num_stages=1, threads=128): + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) shape = [batch, seq_len, heads, dim] - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 @T.macro def MMA0( @@ -43,13 +37,14 @@ def flashattn(batch, by: T.int32, bz: T.int32, ): - T.copy(K[bz, k * block_N:(k + 1) * block_N, by, :], K_shared) + T.copy(K[bz, k * block_N : (k + 1) * block_N, by, :], K_shared) if is_causal: for i, j in T.Parallel(block_M, block_N): - acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0, - -T.infinity(acc_s.dtype)) + acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * 
block_N + j, 0, -T.infinity(acc_s.dtype)) else: - T.clear(acc_s) + # We shall fill -inf for OOB positions + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.if_then_else(k * block_N + j >= seq_len, -T.infinity(acc_s.dtype), 0) T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) @T.macro @@ -62,22 +57,24 @@ def flashattn(batch, by: T.int32, bz: T.int32, ): - T.copy(V[bz, k * block_N:(k + 1) * block_N, by, :], V_shared) + T.copy(V[bz, k * block_N : (k + 1) * block_N, by, :], V_shared) T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) @T.macro def Softmax( - acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype), - acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype), - scores_max: T.FragmentBuffer([block_M], accum_dtype), - scores_max_prev: T.FragmentBuffer([block_M], accum_dtype), - scores_scale: T.FragmentBuffer([block_M], accum_dtype), - scores_sum: T.FragmentBuffer([block_M], accum_dtype), - logsum: T.FragmentBuffer([block_M], accum_dtype), + acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype), + acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype), + scores_max: T.FragmentBuffer([block_M], accum_dtype), + scores_max_prev: T.FragmentBuffer([block_M], accum_dtype), + scores_scale: T.FragmentBuffer([block_M], accum_dtype), + scores_sum: T.FragmentBuffer([block_M], accum_dtype), + logsum: T.FragmentBuffer([block_M], accum_dtype), ): T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) # To do causal softmax, we need to set the scores_max to 0 if it is -inf # This process is called Check_inf in FlashAttention3 code, and it only need to be done # in the first ceil_div(kBlockM, kBlockN) steps. 
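The non-causal branch of MMA0 above now seeds out-of-bounds key columns with -inf rather than clearing the accumulator: when seq_len is not a multiple of block_N, the padded columns of the last tile would otherwise enter the softmax with a finite score and leak probability mass. A small NumPy sketch (shapes and names are made up for illustration) showing that -inf columns contribute exactly nothing:

import numpy as np

seq_len, block_N, dim = 100, 64, 8                # the last K tile has 28 padded columns
rng = np.random.default_rng(0)
q = rng.standard_normal(dim)
k_tile = np.zeros((block_N, dim))                 # padded tile: only seq_len % block_N rows are real
k_tile[: seq_len % block_N] = rng.standard_normal((seq_len % block_N, dim))

scores = k_tile @ q
oob = np.arange(block_N) >= seq_len % block_N
masked = np.where(oob, -np.inf, scores)           # mirrors acc_s[i, j] = -inf for OOB j

p = np.exp(masked - masked.max())                 # softmax numerator for this query row
assert p[oob].sum() == 0.0                        # padded columns carry zero probability mass
# Seeding those columns with 0 instead would let each contribute exp(0 - max) of spurious mass.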
@@ -97,18 +94,18 @@ def flashattn(batch, @T.macro def Rescale( - acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), - scores_scale: T.FragmentBuffer([block_M], accum_dtype), + acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), + scores_scale: T.FragmentBuffer([block_M], accum_dtype), ): for i, j in T.Parallel(block_M, dim): acc_o[i, j] *= scores_scale[i] @T.prim_func def main( - Q: T.Tensor(shape, dtype), - K: T.Tensor(shape, dtype), - V: T.Tensor(shape, dtype), - Output: T.Tensor(shape, dtype), + Q: T.Tensor(shape, dtype), + K: T.Tensor(shape, dtype), + V: T.Tensor(shape, dtype), + Output: T.Tensor(shape, dtype), ): with T.Kernel(T.ceildiv(seq_len, block_M), heads, batch, threads=threads) as (bx, by, bz): Q_shared = T.alloc_shared([block_M, dim], dtype) @@ -124,40 +121,39 @@ def flashattn(batch, scores_sum = T.alloc_fragment([block_M], accum_dtype) logsum = T.alloc_fragment([block_M], accum_dtype) - T.copy(Q[bz, bx * block_M:(bx + 1) * block_M, by, :], Q_shared) + T.copy(Q[bz, bx * block_M : (bx + 1) * block_M, by, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) loop_range = ( - T.min(T.ceildiv(seq_len, block_N), T.ceildiv( - (bx + 1) * block_M, block_N)) if is_causal else T.ceildiv(seq_len, block_N)) + T.min(T.ceildiv(seq_len, block_N), T.ceildiv((bx + 1) * block_M, block_N)) if is_causal else T.ceildiv(seq_len, block_N) + ) for k in T.Pipelined(loop_range, num_stages=num_stages): MMA0(K, Q_shared, K_shared, acc_s, k, bx, by, bz) - Softmax(acc_s, acc_s_cast, scores_max, scores_max_prev, scores_scale, scores_sum, - logsum) + Softmax(acc_s, acc_s_cast, scores_max, scores_max_prev, scores_scale, scores_sum, logsum) Rescale(acc_o, scores_scale) MMA1(V, V_shared, acc_s_cast, acc_o, k, by, bz) for i, j in T.Parallel(block_M, dim): acc_o[i, j] /= logsum[i] T.copy(acc_o, O_shared) - T.copy(O_shared, Output[bz, bx * block_M:(bx + 1) * block_M, by, :]) + T.copy(O_shared, Output[bz, bx * block_M : (bx + 1) * block_M, by, :]) return main def ref_program(Q, K, V, is_causal): dim = Q.size(-1) - scores = torch.einsum('bqhd,bkhd->bhqk', Q, K) + scores = torch.einsum("bqhd,bkhd->bhqk", Q, K) scores = scores / torch.sqrt(torch.tensor(dim, dtype=scores.dtype)) if is_causal: seq_len = Q.size(1) mask = torch.tril(torch.ones(seq_len, seq_len, device=scores.device)) mask = mask.unsqueeze(0).unsqueeze(0) - scores = scores.masked_fill(mask == 0, float('-inf')) + scores = scores.masked_fill(mask == 0, float("-inf")) attention_weights = F.softmax(scores, dim=-1) - output = torch.einsum('bhqk,bkhd->bqhd', attention_weights, V) + output = torch.einsum("bhqk,bkhd->bqhd", attention_weights, V) return output @@ -174,17 +170,8 @@ def main( if is_causal: total_flops *= 0.5 - if (not tune): - kernel = flashattn( - batch, - heads, - seq_len, - dim, - is_causal, - block_M=128, - block_N=128, - num_stages=1, - threads=128) + if not tune: + kernel = flashattn(batch, heads, seq_len, dim, is_causal, block_M=128, block_N=128, num_stages=1, threads=128) ref_program_processed = partial(ref_program, is_causal=is_causal) profiler = kernel.get_profiler() profiler.assert_allclose(ref_program_processed, rtol=0.01, atol=0.01) @@ -208,11 +195,11 @@ def main( if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=8, help='batch size') - parser.add_argument('--heads', type=int, default=32, help='heads') - parser.add_argument('--seq_len', type=int, default=4096, help='sequence length') - parser.add_argument('--dim', type=int, 
default=128, help='dim') - parser.add_argument('--is_causal', action='store_true', help='causal') - parser.add_argument('--tune', action='store_true', help='tune configs') + parser.add_argument("--batch", type=int, default=8, help="batch size") + parser.add_argument("--heads", type=int, default=32, help="heads") + parser.add_argument("--seq_len", type=int, default=4096, help="sequence length") + parser.add_argument("--dim", type=int, default=128, help="dim") + parser.add_argument("--is_causal", action="store_true", help="causal") + parser.add_argument("--tune", action="store_true", help="tune configs") args = parser.parse_args() main(args.batch, args.heads, args.seq_len, args.dim, args.is_causal, args.tune) diff --git a/examples/flash_attention/example_mha_fwd_bshd_wgmma_pipelined.py b/examples/flash_attention/example_mha_fwd_bshd_wgmma_pipelined.py index 3928db4c3b3ec3c47fa7a6a6f4cbacb27c7160a6..ab2aab44f496f7ae422d17f5d2a1baf0c057116e 100644 --- a/examples/flash_attention/example_mha_fwd_bshd_wgmma_pipelined.py +++ b/examples/flash_attention/example_mha_fwd_bshd_wgmma_pipelined.py @@ -15,22 +15,16 @@ def get_configs(): @autotune(configs=get_configs(), warmup=10, rep=10) @tilelang.jit( - out_idx=[3], pass_configs={ + out_idx=[3], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) -def flashattn(batch, - heads, - seq_len, - dim, - is_causal, - block_M=128, - block_N=128, - num_stages=2, - threads=256): - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) + }, +) +def flashattn(batch, heads, seq_len, dim, is_causal, block_M=128, block_N=128, num_stages=2, threads=256): + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) shape = [batch, seq_len, heads, dim] - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 @T.macro def MMA0( @@ -43,13 +37,14 @@ def flashattn(batch, by: T.int32, bz: T.int32, ): - T.copy(K[bz, k * block_N:(k + 1) * block_N, by, :], K_shared) + T.copy(K[bz, k * block_N : (k + 1) * block_N, by, :], K_shared) if is_causal: for i, j in T.Parallel(block_M, block_N): - acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0, - -T.infinity(acc_s.dtype)) + acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0, -T.infinity(acc_s.dtype)) else: - T.clear(acc_s) + # We shall fill -inf for OOB positions + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.if_then_else(k * block_N + j >= seq_len, -T.infinity(acc_s.dtype), 0) T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) @T.macro @@ -62,22 +57,24 @@ def flashattn(batch, by: T.int32, bz: T.int32, ): - T.copy(V[bz, k * block_N:(k + 1) * block_N, by, :], V_shared) + T.copy(V[bz, k * block_N : (k + 1) * block_N, by, :], V_shared) T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) @T.macro def Softmax( - acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype), - acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype), - scores_max: T.FragmentBuffer([block_M], accum_dtype), - scores_max_prev: T.FragmentBuffer([block_M], accum_dtype), - scores_scale: T.FragmentBuffer([block_M], accum_dtype), - scores_sum: T.FragmentBuffer([block_M], accum_dtype), - logsum: T.FragmentBuffer([block_M], accum_dtype), + acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype), + acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype), + scores_max: T.FragmentBuffer([block_M], accum_dtype), + scores_max_prev: T.FragmentBuffer([block_M], accum_dtype), + scores_scale: T.FragmentBuffer([block_M], accum_dtype), + scores_sum: 
T.FragmentBuffer([block_M], accum_dtype), + logsum: T.FragmentBuffer([block_M], accum_dtype), ): T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) # To do causal softmax, we need to set the scores_max to 0 if it is -inf # This process is called Check_inf in FlashAttention3 code, and it only need to be done # in the first ceil_div(kBlockM, kBlockN) steps. @@ -97,18 +94,18 @@ def flashattn(batch, @T.macro def Rescale( - acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), - scores_scale: T.FragmentBuffer([block_M], accum_dtype), + acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), + scores_scale: T.FragmentBuffer([block_M], accum_dtype), ): for i, j in T.Parallel(block_M, dim): acc_o[i, j] *= scores_scale[i] @T.prim_func def main( - Q: T.Tensor(shape, dtype), - K: T.Tensor(shape, dtype), - V: T.Tensor(shape, dtype), - Output: T.Tensor(shape, dtype), + Q: T.Tensor(shape, dtype), + K: T.Tensor(shape, dtype), + V: T.Tensor(shape, dtype), + Output: T.Tensor(shape, dtype), ): with T.Kernel(T.ceildiv(seq_len, block_M), heads, batch, threads=threads) as (bx, by, bz): Q_shared = T.alloc_shared([block_M, dim], dtype) @@ -124,45 +121,45 @@ def flashattn(batch, scores_sum = T.alloc_fragment([block_M], accum_dtype) logsum = T.alloc_fragment([block_M], accum_dtype) - T.copy(Q[bz, bx * block_M:(bx + 1) * block_M, by, :], Q_shared) + T.copy(Q[bz, bx * block_M : (bx + 1) * block_M, by, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) loop_range = ( - T.min(T.ceildiv(seq_len, block_N), T.ceildiv( - (bx + 1) * block_M, block_N)) if is_causal else T.ceildiv(seq_len, block_N)) + T.min(T.ceildiv(seq_len, block_N), T.ceildiv((bx + 1) * block_M, block_N)) if is_causal else T.ceildiv(seq_len, block_N) + ) for k in T.Pipelined( - loop_range, - num_stages=num_stages, - order=[-1, 0, 3, 1, -1, 2], - stage=[-1, 0, 0, 1, -1, 1], - group=[[0], [1, 2], [3, 4, 5, 6, 7, 8, 9, 10], [11], [12], [13]]): + loop_range, + num_stages=num_stages, + order=[-1, 0, 3, 1, -1, 2], + stage=[-1, 0, 0, 1, -1, 1], + group=[[0], [1, 2], [3, 4, 5, 6, 7, 8, 9, 10, 11], [12], [13], [14]], + ): MMA0(K, Q_shared, K_shared, acc_s, k, bx, by, bz) - Softmax(acc_s, acc_s_cast, scores_max, scores_max_prev, scores_scale, scores_sum, - logsum) + Softmax(acc_s, acc_s_cast, scores_max, scores_max_prev, scores_scale, scores_sum, logsum) Rescale(acc_o, scores_scale) MMA1(V, V_shared, acc_s_cast, acc_o, k, by, bz) for i, j in T.Parallel(block_M, dim): acc_o[i, j] /= logsum[i] T.copy(acc_o, O_shared) - T.copy(O_shared, Output[bz, bx * block_M:(bx + 1) * block_M, by, :]) + T.copy(O_shared, Output[bz, bx * block_M : (bx + 1) * block_M, by, :]) return main def ref_program(Q, K, V, is_causal): dim = Q.size(-1) - scores = torch.einsum('bqhd,bkhd->bhqk', Q, K) + scores = torch.einsum("bqhd,bkhd->bhqk", Q, K) scores = scores / torch.sqrt(torch.tensor(dim, dtype=scores.dtype)) if is_causal: seq_len = Q.size(1) mask = torch.tril(torch.ones(seq_len, seq_len, device=scores.device)) mask = mask.unsqueeze(0).unsqueeze(0) - scores = scores.masked_fill(mask == 0, float('-inf')) + scores = scores.masked_fill(mask == 0, float("-inf")) attention_weights = F.softmax(scores, dim=-1) - output = torch.einsum('bhqk,bkhd->bqhd', attention_weights, V) + output = torch.einsum("bhqk,bkhd->bqhd", attention_weights, V) return output @@ -179,17 +176,8 @@ def main( if 
is_causal: total_flops *= 0.5 - if (not tune): - kernel = flashattn( - batch, - heads, - seq_len, - dim, - is_causal, - block_M=128, - block_N=128, - num_stages=2, - threads=256) + if not tune: + kernel = flashattn(batch, heads, seq_len, dim, is_causal, block_M=128, block_N=128, num_stages=2, threads=256) ref_program_processed = partial(ref_program, is_causal=is_causal) profiler = kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Normal) profiler.assert_allclose(ref_program_processed, rtol=0.01, atol=0.01) @@ -213,11 +201,11 @@ def main( if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=8, help='batch size') - parser.add_argument('--heads', type=int, default=32, help='heads') - parser.add_argument('--seq_len', type=int, default=4096, help='sequence length') - parser.add_argument('--dim', type=int, default=128, help='dim') - parser.add_argument('--is_causal', action='store_true', help='causal') - parser.add_argument('--tune', action='store_true', help='tune configs') + parser.add_argument("--batch", type=int, default=8, help="batch size") + parser.add_argument("--heads", type=int, default=32, help="heads") + parser.add_argument("--seq_len", type=int, default=4096, help="sequence length") + parser.add_argument("--dim", type=int, default=128, help="dim") + parser.add_argument("--is_causal", action="store_true", help="causal") + parser.add_argument("--tune", action="store_true", help="tune configs") args = parser.parse_args() main(args.batch, args.heads, args.seq_len, args.dim, args.is_causal, args.tune) diff --git a/examples/flash_attention/example_mha_fwd_varlen.py b/examples/flash_attention/example_mha_fwd_varlen.py index f381e900aff61436b12365491f8cb06e5f37d74c..6ba2e8ab472d371cedcf1fe2d7203e25c85ec099 100644 --- a/examples/flash_attention/example_mha_fwd_varlen.py +++ b/examples/flash_attention/example_mha_fwd_varlen.py @@ -11,14 +11,14 @@ from varlen_utils import generate_random_padding_mask, generate_qkv def attention_ref( - q, - k, - v, - query_padding_mask=None, - key_padding_mask=None, - causal=False, - window_size=(-1, -1), # -1 means infinite window size - upcast=True, + q, + k, + v, + query_padding_mask=None, + key_padding_mask=None, + causal=False, + window_size=(-1, -1), # -1 means infinite window size + upcast=True, ): """ Arguments: @@ -47,7 +47,7 @@ def attention_ref( if upcast: q, k, v = q.float(), k.float(), v.float() dim = q.shape[-1] - scale = (1.0 / dim)**0.5 # log2(e) + scale = (1.0 / dim) ** 0.5 # log2(e) k = repeat(k, "b s h d -> b s (h g) d", g=q.shape[2] // k.shape[2]) v = repeat(v, "b s h d -> b s (h g) d", g=q.shape[2] // v.shape[2]) scores = torch.einsum("bthd,bshd->bhts", q, k) @@ -68,41 +68,32 @@ def attention_ref( @tilelang.jit( - out_idx=[6], pass_configs={ + out_idx=[6], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) -def flashattn(batch_size, - UQ, - UKV, - heads, - dim, - is_causal, - block_M=64, - block_N=64, - num_stages=0, - threads=32): - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) + }, +) +def flashattn(batch_size, UQ, UKV, heads, dim, is_causal, block_M=64, block_N=64, num_stages=0, threads=32): + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) q_shape = [UQ, heads, dim] k_shape = [UKV, heads, dim] v_shape = [UKV, heads, dim] o_shape = [UQ, heads, dim] - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 @T.prim_func def main( - Q_unpad: T.Tensor(q_shape, dtype), - K_unpad: T.Tensor(k_shape, dtype), - 
V_unpad: T.Tensor(v_shape, dtype), - cu_seqlens_q: T.Tensor([batch_size + 1], "int32"), - cu_seqlens_k: T.Tensor([batch_size + 1], "int32"), - max_seqlen_q: T.int32, - Output_unpad: T.Tensor(o_shape, dtype), + Q_unpad: T.Tensor(q_shape, dtype), + K_unpad: T.Tensor(k_shape, dtype), + V_unpad: T.Tensor(v_shape, dtype), + cu_seqlens_q: T.Tensor([batch_size + 1], T.int32), + cu_seqlens_k: T.Tensor([batch_size + 1], T.int32), + max_seqlen_q: T.int32, + Output_unpad: T.Tensor(o_shape, dtype), ): - with T.Kernel( - T.ceildiv(max_seqlen_q, block_M), heads, batch_size, - threads=threads) as (bx, by, bz): + with T.Kernel(T.ceildiv(max_seqlen_q, block_M), heads, batch_size, threads=threads) as (bx, by, bz): Q_shared = T.alloc_shared([block_M, dim], dtype, "shared") K_shared = T.alloc_shared([block_N, dim], dtype, "shared") V_shared = T.alloc_shared([block_N, dim], dtype, "shared") @@ -151,15 +142,17 @@ def flashattn(batch_size, K_shared[i, d] = 0 if is_causal: for i, j in T.Parallel(block_M, block_N): - acc_s[i, j] = T.if_then_else((bx * block_M + i >= k * block_N + j) and - (bx * block_M + i >= q_current_seqlen or - k * block_N + j >= k_current_seqlen), - -T.infinity(acc_s.dtype), 0) + acc_s[i, j] = T.if_then_else( + (bx * block_M + i >= k * block_N + j) + and (bx * block_M + i >= q_current_seqlen or k * block_N + j >= k_current_seqlen), + -T.infinity(acc_s.dtype), + 0, + ) else: for i, j in T.Parallel(block_M, block_N): - acc_s[i, j] = T.if_then_else((bx * block_M + i >= q_current_seqlen or - k * block_N + j >= k_current_seqlen), - -T.infinity(acc_s.dtype), 0) + acc_s[i, j] = T.if_then_else( + (bx * block_M + i >= q_current_seqlen or k * block_N + j >= k_current_seqlen), -T.infinity(acc_s.dtype), 0 + ) T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) @@ -167,6 +160,8 @@ def flashattn(batch_size, T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) # To do causal softmax, we need to set the scores_max to 0 if it is -inf # This process is called Check_inf in FlashAttention3 code, and it only need to be done # in the first ceil_div(kBlockM, kBlockN) steps. 
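Note on the varlen kernel above: it never sees padded [batch, seq, heads, dim] tensors. Q_unpad/K_unpad/V_unpad are all sequences concatenated along the token axis, and cu_seqlens_q/cu_seqlens_k are prefix sums telling each block where its batch element starts and ends; positions past the true length are masked to -inf before the softmax, as in the hunk above. A rough torch-only sketch of that packing convention follows (the actual helpers are generate_qkv and unpad_input used by varlen_utils.py; pack_varlen below is a hypothetical stand-in, not the patch's code):

import torch

def pack_varlen(x, padding_mask):
    """Pack a padded batch [B, S, H, D] into the unpadded layout [total_tokens, H, D].

    padding_mask: [B, S] bool, True where a token is real.
    Returns (x_unpad, cu_seqlens, max_seqlen); sequence b occupies rows
    cu_seqlens[b] : cu_seqlens[b + 1] of x_unpad.
    """
    seqlens = padding_mask.sum(dim=1, dtype=torch.int32)             # [B] true lengths
    cu_seqlens = torch.zeros(seqlens.numel() + 1, dtype=torch.int32, device=x.device)
    cu_seqlens[1:] = torch.cumsum(seqlens, dim=0)                    # prefix sums
    x_unpad = x[padding_mask]                                        # [total_tokens, H, D]
    return x_unpad, cu_seqlens, int(seqlens.max())

For instance, with batch=8, seq_len=2048 and a fully populated mask, cu_seqlens would be [0, 2048, 4096, ..., 16384] and x_unpad would have 16384 rows; the per-sequence lengths recovered from cu_seqlens are what the kernel compares against when it fills out-of-range score positions with -inf.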
@@ -242,8 +237,7 @@ def main(batch: int = 8, heads: int = 64, seq_len: int = 2048, dim: int = 128): output_pad_fn, dq_pad_fn, dk_pad_fn, - ) = generate_qkv( - q, k, v, query_padding_mask, key_padding_mask, kvpacked=False) + ) = generate_qkv(q, k, v, query_padding_mask, key_padding_mask, kvpacked=False) UQ = q_unpad.shape[0] # unpadded query length UK = k_unpad.shape[0] # unpadded key length @@ -285,10 +279,10 @@ def main(batch: int = 8, heads: int = 64, seq_len: int = 2048, dim: int = 128): if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=8, help='batch size') - parser.add_argument('--heads', type=int, default=64, help='heads') - parser.add_argument('--seq_len', type=int, default=2048, help='sequence length') - parser.add_argument('--dim', type=int, default=128, help='dim') + parser.add_argument("--batch", type=int, default=8, help="batch size") + parser.add_argument("--heads", type=int, default=64, help="heads") + parser.add_argument("--seq_len", type=int, default=2048, help="sequence length") + parser.add_argument("--dim", type=int, default=128, help="dim") args = parser.parse_args() main(args.batch, args.heads, args.seq_len, args.dim) diff --git a/examples/flash_attention/test_example_flash_attention.py b/examples/flash_attention/test_example_flash_attention.py index f4932aee9a696058957f141fae8dacdaf10fdbf6..da172bb62a4dee0ada293fc1249812905ded8de1 100644 --- a/examples/flash_attention/test_example_flash_attention.py +++ b/examples/flash_attention/test_example_flash_attention.py @@ -2,7 +2,7 @@ import tilelang.testing import example_gqa_bwd import example_gqa_bwd_wgmma_pipelined -import example_mha_bwd +import example_mha_bwd_bshd import example_mha_bwd_bhsd import example_mha_fwd_bhsd_wgmma_pipelined import example_gqa_fwd_bshd @@ -10,7 +10,7 @@ import example_mha_fwd_bshd import example_gqa_fwd_bshd_wgmma_pipelined import example_mha_fwd_bshd_wgmma_pipelined import example_mha_fwd_varlen -import example_mha_bwd_wgmma_pipelined +import example_mha_bwd_bshd_wgmma_pipelined import example_mha_fwd_bhsd import example_gqa_bwd_tma_reduce_varlen @@ -33,7 +33,7 @@ def test_example_gqa_bwd_wgmma_pipelined(): @tilelang.testing.requires_cuda def test_example_mha_bwd(): - example_mha_bwd.main( + example_mha_bwd_bshd.main( BATCH=1, H=16, N_CTX=512, @@ -56,20 +56,18 @@ def test_example_mha_bwd_bhsd(): @tilelang.testing.requires_cuda @tilelang.testing.requires_cuda_compute_version_ge(9, 0) def test_example_mha_bwd_wgmma_pipelined(): - example_mha_bwd_wgmma_pipelined.main(BATCH=1, H=32, N_CTX=256, D_HEAD=64, causal=False) + example_mha_bwd_bshd_wgmma_pipelined.main(BATCH=1, H=32, N_CTX=256, D_HEAD=64, causal=False) @tilelang.testing.requires_cuda @tilelang.testing.requires_cuda_compute_version_ge(9, 0) def test_example_gqa_fwd_bshd_wgmma_pipelined(): - example_gqa_fwd_bshd_wgmma_pipelined.main( - batch=1, heads=16, seq_len=1024, dim=128, is_causal=False, groups=16, tune=False) + example_gqa_fwd_bshd_wgmma_pipelined.main(batch=1, heads=16, seq_len=1024, dim=128, is_causal=False, groups=16, tune=False) @tilelang.testing.requires_cuda def test_example_gqa_fwd_bshd(): - example_gqa_fwd_bshd.main( - batch=1, heads=16, seq_len=1024, dim=128, is_causal=False, groups=16, tune=False) + example_gqa_fwd_bshd.main(batch=1, heads=16, seq_len=1024, dim=128, is_causal=False, groups=16, tune=False) @tilelang.testing.requires_cuda diff --git a/examples/flash_attention/varlen_utils.py b/examples/flash_attention/varlen_utils.py index 
4301215d554b23f18c241391a58886e452d7531a..43e21cc3b80ce72eaa582407024ec2c42015731e 100644 --- a/examples/flash_attention/varlen_utils.py +++ b/examples/flash_attention/varlen_utils.py @@ -9,22 +9,14 @@ def generate_random_padding_mask(max_seqlen, batch_size, device, mode="random"): if mode == "full": lengths = torch.full((batch_size, 1), max_seqlen, device=device, dtype=torch.int32) elif mode == "random": - lengths = torch.randint( - max(1, max_seqlen - 20), max_seqlen + 1, (batch_size, 1), device=device) + lengths = torch.randint(max(1, max_seqlen - 20), max_seqlen + 1, (batch_size, 1), device=device) elif mode == "third": lengths = torch.randint(max_seqlen // 3, max_seqlen + 1, (batch_size, 1), device=device) - padding_mask = ( - repeat(torch.arange(max_seqlen, device=device), "s -> b s", b=batch_size) < lengths) + padding_mask = repeat(torch.arange(max_seqlen, device=device), "s -> b s", b=batch_size) < lengths return padding_mask -def generate_qkv(q, - k, - v, - query_padding_mask=None, - key_padding_mask=None, - kvpacked=False, - qkvpacked=False): +def generate_qkv(q, k, v, query_padding_mask=None, key_padding_mask=None, kvpacked=False, qkvpacked=False): """ Arguments: q: (batch_size, seqlen_q, nheads, d) @@ -39,15 +31,12 @@ def generate_qkv(q, if query_padding_mask is not None: q_unpad, indices_q, cu_seqlens_q, max_seqlen_q = unpad_input(q, query_padding_mask) - output_pad_fn = lambda output_unpad: pad_input(output_unpad, indices_q, batch_size, seqlen_q - ) + output_pad_fn = lambda output_unpad: pad_input(output_unpad, indices_q, batch_size, seqlen_q) else: q_unpad = rearrange(q, "b s h d -> (b s) h d") - cu_seqlens_q = torch.arange( - 0, (batch_size + 1) * seqlen_q, step=seqlen_q, dtype=torch.int32, device=q_unpad.device) + cu_seqlens_q = torch.arange(0, (batch_size + 1) * seqlen_q, step=seqlen_q, dtype=torch.int32, device=q_unpad.device) max_seqlen_q = seqlen_q - output_pad_fn = lambda output_unpad: rearrange( - output_unpad, "(b s) h d -> b s h d", b=batch_size) + output_pad_fn = lambda output_unpad: rearrange(output_unpad, "(b s) h d -> b s h d", b=batch_size) if key_padding_mask is not None: k_unpad, indices_k, cu_seqlens_k, max_seqlen_k = unpad_input(k, key_padding_mask) @@ -55,8 +44,7 @@ def generate_qkv(q, else: k_unpad = rearrange(k, "b s h d -> (b s) h d") v_unpad = rearrange(v, "b s h d -> (b s) h d") - cu_seqlens_k = torch.arange( - 0, (batch_size + 1) * seqlen_k, step=seqlen_k, dtype=torch.int32, device=k_unpad.device) + cu_seqlens_k = torch.arange(0, (batch_size + 1) * seqlen_k, step=seqlen_k, dtype=torch.int32, device=k_unpad.device) max_seqlen_k = seqlen_k if qkvpacked: @@ -67,8 +55,7 @@ def generate_qkv(q, if query_padding_mask is not None: dqkv_pad_fn = lambda dqkv_unpad: pad_input(dqkv_unpad, indices_q, batch_size, seqlen_q) else: - dqkv_pad_fn = lambda dqkv_unpad: rearrange( - dqkv_unpad, "(b s) t h d -> b s t h d", b=batch_size) + dqkv_pad_fn = lambda dqkv_unpad: rearrange(dqkv_unpad, "(b s) t h d -> b s t h d", b=batch_size) return ( qkv_unpad.detach().requires_grad_(), cu_seqlens_q, @@ -84,8 +71,7 @@ def generate_qkv(q, if key_padding_mask is not None: dkv_pad_fn = lambda dkv_unpad: pad_input(dkv_unpad, indices_k, batch_size, seqlen_k) else: - dkv_pad_fn = lambda dkv_unpad: rearrange( - dkv_unpad, "(b s) t h d -> b s t h d", b=batch_size) + dkv_pad_fn = lambda dkv_unpad: rearrange(dkv_unpad, "(b s) t h d -> b s t h d", b=batch_size) return ( q_unpad.detach().requires_grad_(), kv_unpad.detach().requires_grad_(), diff --git 
a/examples/flash_decoding/example_gqa_decode.py b/examples/flash_decoding/example_gqa_decode.py index 9ec3a026513ab0fc919258ca9c31f3db36b939a1..ee42df2080f3f3ed468b522e9df0db25377144de 100644 --- a/examples/flash_decoding/example_gqa_decode.py +++ b/examples/flash_decoding/example_gqa_decode.py @@ -15,18 +15,12 @@ torch.random.manual_seed(0) def get_configs(): block_N = [64, 128] block_H = [64] - num_split = [2, 4, 8] + num_split = [1, 2, 4, 8] num_stages = [1, 2, 3] threads = [128] _configs = list(itertools.product(block_N, block_H, num_split, num_stages, threads)) - configs = [{ - 'block_N': c[0], - 'block_H': c[1], - 'num_split': c[2], - 'num_stages': c[3], - 'threads': c[4] - } for c in _configs] + configs = [{"block_N": c[0], "block_H": c[1], "num_split": c[2], "num_stages": c[3], "threads": c[4]} for c in _configs] return configs @@ -42,29 +36,25 @@ def get_heuristic_config() -> Tuple[Dict, int]: if sm_version == 89: cfg = dict(block_N=128, block_H=64, num_split=1, num_stages=0, threads=128) else: - cfg = dict(block_N=128, block_H=64, num_split=1, num_stages=2, threads=128) + cfg = dict(block_N=128, block_H=64, num_split=8, num_stages=2, threads=128) return cfg, sm_version # TODO(lei): fix warp specialized and tma lower pass def get_pass_configs(): - return { - tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, - tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True - } + return {tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True} @autotune(configs=get_configs(), warmup=10, rep=10) @tilelang.jit(out_idx=[6], pass_configs=get_pass_configs()) -def flashattn(batch, heads, groups, seqlen_kv, dim, block_N, block_H, num_split, num_stages, - threads): - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) +def flashattn(batch, heads, groups, seqlen_kv, dim, block_N, block_H, num_split, num_stages, threads): + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) shape_q = [batch, heads, dim] shape_k = [batch, seqlen_kv, groups, dim] shape_v = [batch, seqlen_kv, groups, dim] shape_o = [batch, heads, dim] - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 kv_group_num = heads // groups part_shape = [batch, heads, num_split, dim] @@ -73,11 +63,11 @@ def flashattn(batch, heads, groups, seqlen_kv, dim, block_N, block_H, num_split, @T.macro def flash_attn( - Q: T.Tensor(shape_q, dtype), - K: T.Tensor(shape_k, dtype), - V: T.Tensor(shape_v, dtype), - mask: T.Tensor([batch, seqlen_kv, groups], "uint8"), - Output: T.Tensor([batch, heads, dim], dtype), + Q: T.Tensor(shape_q, dtype), + K: T.Tensor(shape_k, dtype), + V: T.Tensor(shape_v, dtype), + mask: T.Tensor([batch, seqlen_kv, groups], "uint8"), + Output: T.Tensor([batch, heads, dim], dtype), ): with T.Kernel(batch, heads // valid_block_H, num_split, threads=threads) as (bx, by, bz): Q_shared = T.alloc_shared([block_H, dim], dtype) @@ -98,23 +88,24 @@ def flashattn(batch, heads, groups, seqlen_kv, dim, block_N, block_H, num_split, hid = by cur_kv_head = hid // (kv_group_num // valid_block_H) - T.copy(Q[bid, hid * valid_block_H:hid * valid_block_H + block_H, :], Q_shared) + T.copy(Q[bid, hid * valid_block_H : hid * valid_block_H + block_H, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) loop_range = T.ceildiv((seqlen_kv // num_split), block_N) for k in T.Pipelined(loop_range, num_stages=num_stages): - T.copy(K[bid, k * block_N:(k + 1) * block_N, cur_kv_head, :], K_shared) - T.copy(mask[bid, k * block_N:(k + 1) * 
block_N, cur_kv_head], mask_local) + T.copy(K[bid, k * block_N : (k + 1) * block_N, cur_kv_head, :], K_shared) + T.copy(mask[bid, k * block_N : (k + 1) * block_N, cur_kv_head], mask_local) T.clear(acc_s) T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) for i, j in T.Parallel(block_H, block_N): - acc_s[i, j] = T.if_then_else(mask_local[j] != 0, acc_s[i, j], - -T.infinity(accum_dtype)) + acc_s[i, j] = T.if_then_else(mask_local[j] != 0, acc_s[i, j], -T.infinity(accum_dtype)) T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_H): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) for i in T.Parallel(block_H): scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_H, block_N): @@ -125,23 +116,23 @@ def flashattn(batch, heads, groups, seqlen_kv, dim, block_N, block_H, num_split, T.copy(acc_s, acc_s_cast) for i, j in T.Parallel(block_H, dim): acc_o[i, j] *= scores_scale[i] - T.copy(V[bid, k * block_N:(k + 1) * block_N, cur_kv_head, :], V_shared) + T.copy(V[bid, k * block_N : (k + 1) * block_N, cur_kv_head, :], V_shared) T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) for i, j in T.Parallel(block_H, dim): acc_o[i, j] /= logsum[i] for i in T.Parallel(block_H): logsum[i] = T.log2(logsum[i]) + scores_max[i] * scale T.copy(acc_o[:valid_block_H, :], O_shared) - T.copy(O_shared, Output[bid, hid * valid_block_H:(hid + 1) * valid_block_H, :]) + T.copy(O_shared, Output[bid, hid * valid_block_H : (hid + 1) * valid_block_H, :]) @T.macro def flash_attn_split( - Q: T.Tensor(shape_q, dtype), - K: T.Tensor(shape_k, dtype), - V: T.Tensor(shape_v, dtype), - mask: T.Tensor([batch, seqlen_kv, groups], "uint8"), - glse: T.Tensor([batch, heads, num_split], dtype), - Output_partial: T.Tensor(part_shape, dtype), + Q: T.Tensor(shape_q, dtype), + K: T.Tensor(shape_k, dtype), + V: T.Tensor(shape_v, dtype), + mask: T.Tensor([batch, seqlen_kv, groups], "uint8"), + glse: T.Tensor([batch, heads, num_split], dtype), + Output_partial: T.Tensor(part_shape, dtype), ): with T.Kernel(batch, heads // valid_block_H, num_split, threads=threads) as (bx, by, bz): Q_shared = T.alloc_shared([block_H, dim], dtype) @@ -163,7 +154,7 @@ def flashattn(batch, heads, groups, seqlen_kv, dim, block_N, block_H, num_split, sid = bz cur_kv_head = hid // (kv_group_num // valid_block_H) - T.copy(Q[bid, hid * valid_block_H:hid * valid_block_H + block_H, :], Q_shared) + T.copy(Q[bid, hid * valid_block_H : hid * valid_block_H + block_H, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) @@ -172,22 +163,31 @@ def flashattn(batch, heads, groups, seqlen_kv, dim, block_N, block_H, num_split, for k in T.Pipelined(loop_range, num_stages=num_stages): T.copy( - K[bid, (seqlen_kv // num_split) * sid + - k * valid_block_N:(seqlen_kv // num_split) * sid + (k + 1) * valid_block_N, - cur_kv_head, :], K_shared) + K[ + bid, + (seqlen_kv // num_split) * sid + k * valid_block_N : (seqlen_kv // num_split) * sid + (k + 1) * valid_block_N, + cur_kv_head, + :, + ], + K_shared, + ) T.copy( - mask[bid, (seqlen_kv // num_split) * sid + - k * valid_block_N:(seqlen_kv // num_split) * sid + (k + 1) * valid_block_N, - cur_kv_head], mask_local) + mask[ + bid, + (seqlen_kv // num_split) * sid + k * valid_block_N : (seqlen_kv // num_split) * sid + (k + 1) * valid_block_N, + cur_kv_head, + ], + mask_local, + ) 
T.clear(acc_s) T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) for i, j in T.Parallel(block_H, block_N): - acc_s[i, - j] = T.if_then_else((mask_local[j] != 0) & (j < seqlen_kv // num_split), - acc_s[i, j], -T.infinity(accum_dtype)) + acc_s[i, j] = T.if_then_else((mask_local[j] != 0) & (j < seqlen_kv // num_split), acc_s[i, j], -T.infinity(accum_dtype)) T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_H): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) for i in T.Parallel(block_H): scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_H, block_N): @@ -199,9 +199,14 @@ def flashattn(batch, heads, groups, seqlen_kv, dim, block_N, block_H, num_split, for i, j in T.Parallel(block_H, dim): acc_o[i, j] *= scores_scale[i] T.copy( - V[bid, (seqlen_kv // num_split) * sid + - k * valid_block_N:(seqlen_kv // num_split) * sid + (k + 1) * valid_block_N, - cur_kv_head, :], V_shared) + V[ + bid, + (seqlen_kv // num_split) * sid + k * valid_block_N : (seqlen_kv // num_split) * sid + (k + 1) * valid_block_N, + cur_kv_head, + :, + ], + V_shared, + ) T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) for i, j in T.Parallel(block_H, dim): acc_o[i, j] /= logsum[i] @@ -212,72 +217,74 @@ def flashattn(batch, heads, groups, seqlen_kv, dim, block_N, block_H, num_split, if i < valid_block_H: glse[bid, hid * valid_block_H + i, sid] = logsum[i] T.copy(acc_o[:valid_block_H, :], O_shared) - T.copy(O_shared, Output_partial[bid, hid * valid_block_H:(hid + 1) * valid_block_H, - sid, :]) + T.copy(O_shared, Output_partial[bid, hid * valid_block_H : (hid + 1) * valid_block_H, sid, :]) @T.macro def combine( - glse: T.Tensor([batch, heads, num_split], dtype), - Output_partial: T.Tensor(part_shape, dtype), - Output: T.Tensor(shape_o, dtype), + glse: T.Tensor([batch, heads, num_split], dtype), + Output_partial: T.Tensor(part_shape, dtype), + Output: T.Tensor(shape_o, dtype), ): with T.Kernel(heads, batch, threads=128) as (by, bz): po_local = T.alloc_fragment([dim], dtype) o_accum_local = T.alloc_fragment([dim], accum_dtype) lse_local = T.alloc_fragment([num_split, 128], dtype) - lse_local_split = T.alloc_local([1], accum_dtype) - lse_logsum_local = T.alloc_local([1], accum_dtype) + lse_logsum_local = T.alloc_fragment([128], accum_dtype) lse_max_local = T.alloc_fragment([128], accum_dtype) - scale_local = T.alloc_local([1], accum_dtype) + scale_local = T.alloc_fragment([128], accum_dtype) - T.annotate_layout({ - lse_logsum_local: T.Fragment(lse_logsum_local.shape, forward_thread_fn=lambda i: i), - lse_max_local: T.Fragment(lse_max_local.shape, forward_thread_fn=lambda i: i), - # lse_local: (local_id, thread_id) - lse_local: T.Fragment(lse_local.shape, forward_fn=lambda i, j: (j, i)), - }) + T.annotate_layout( + { + lse_logsum_local: T.Fragment(lse_logsum_local.shape, forward_thread_fn=lambda i: i), + lse_max_local: T.Fragment(lse_max_local.shape, forward_thread_fn=lambda i: i), + # lse_local: (local_id, thread_id) + lse_local: T.Fragment(lse_local.shape, forward_fn=lambda i, j: (j, i)), + } + ) T.clear(lse_logsum_local) T.clear(o_accum_local) for k, j in T.Parallel(num_split, 128): lse_local[k, j] = glse[bz, by, k] T.reduce_max(lse_local, lse_max_local, dim=0, clear=True) - for k in T.Pipelined(num_split, num_stages=1): - lse_local_split[0] = glse[bz, by, k] - lse_logsum_local[0] += 
T.exp2(lse_local_split[0] - lse_max_local[0]) - lse_logsum_local[0] = T.log2(lse_logsum_local[0]) + lse_max_local[0] + for k in T.serial(num_split): + for j in T.Parallel(128): + lse_logsum_local[j] += T.exp2(lse_local[k, j] - lse_max_local[j]) + for j in T.Parallel(128): + lse_logsum_local[j] = T.log2(lse_logsum_local[j]) + lse_max_local[j] for k in T.serial(num_split): for i in T.Parallel(dim): po_local[i] = Output_partial[bz, by, k, i] - lse_local_split[0] = glse[bz, by, k] - scale_local[0] = T.exp2(lse_local_split[0] - lse_logsum_local[0]) + for j in T.Parallel(128): + scale_local[j] = T.exp2(lse_local[k, j] - lse_logsum_local[j]) + # Note: Pay attention to dim and the number of threads in Parallel for i in T.Parallel(dim): - o_accum_local[i] += po_local[i] * scale_local[0] + o_accum_local[i] += po_local[i] * scale_local[i] for i in T.Parallel(dim): Output[bz, by, i] = o_accum_local[i] @T.prim_func def flashattn_gqa_decode_split( - Q: T.Tensor(shape_q, dtype), - K: T.Tensor(shape_k, dtype), - V: T.Tensor(shape_v, dtype), - mask: T.Tensor([batch, seqlen_kv, groups], "uint8"), - glse: T.Tensor([batch, heads, num_split], dtype), - Output_partial: T.Tensor(part_shape, dtype), - Output: T.Tensor(shape_o, dtype), + Q: T.Tensor(shape_q, dtype), + K: T.Tensor(shape_k, dtype), + V: T.Tensor(shape_v, dtype), + mask: T.Tensor([batch, seqlen_kv, groups], "uint8"), + glse: T.Tensor([batch, heads, num_split], dtype), + Output_partial: T.Tensor(part_shape, dtype), + Output: T.Tensor(shape_o, dtype), ): flash_attn_split(Q, K, V, mask, glse, Output_partial) combine(glse, Output_partial, Output) @T.prim_func def flashattn_gqa_decode_no_split( - Q: T.Tensor(shape_q, dtype), - K: T.Tensor(shape_k, dtype), - V: T.Tensor(shape_v, dtype), - mask: T.Tensor([batch, seqlen_kv, groups], "uint8"), - glse: T.Tensor([batch, heads, num_split], dtype), - Output_partial: T.Tensor(part_shape, dtype), - Output: T.Tensor(shape_o, dtype), + Q: T.Tensor(shape_q, dtype), + K: T.Tensor(shape_k, dtype), + V: T.Tensor(shape_v, dtype), + mask: T.Tensor([batch, seqlen_kv, groups], "uint8"), + glse: T.Tensor([batch, heads, num_split], dtype), + Output_partial: T.Tensor(part_shape, dtype), + Output: T.Tensor(shape_o, dtype), ): flash_attn(Q, K, V, mask, Output) @@ -300,27 +307,21 @@ def ref_program(query, key, value, mask, glse, Output_partial): dim = query.shape[-1] num_head_groups = query.shape[1] // key.shape[2] scale = dim**0.5 - key = rearrange(key, 'b n h d -> b h n d') # [batch_size, groups, seqlen_kv, dim] - value = rearrange(value, 'b n h d -> b h n d') # [batch_size, groups, seqlen_kv, dim] + key = rearrange(key, "b n h d -> b h n d") # [batch_size, groups, seqlen_kv, dim] + value = rearrange(value, "b n h d -> b h n d") # [batch_size, groups, seqlen_kv, dim] - query = rearrange( - query, 'b (h g) d -> b g h d', - g=num_head_groups) # [batch_size, num_head_groups, groups, dim] + query = rearrange(query, "b (h g) d -> b g h d", g=num_head_groups) # [batch_size, num_head_groups, groups, dim] - scores = einsum( - query, key, - 'b g h d, b h s d -> b g h s') # [batch_size, num_head_groups, groups, seqlen_kv] + scores = einsum(query, key, "b g h d, b h s d -> b g h s") # [batch_size, num_head_groups, groups, seqlen_kv] if mask is not None: - mask = rearrange(mask, 'b s h -> b h s') + mask = rearrange(mask, "b s h -> b h s") mask = mask.unsqueeze(1) - scores = scores.masked_fill(mask == 0, float('-inf')) + scores = scores.masked_fill(mask == 0, float("-inf")) - attention = F.softmax( - scores / scale, dim=-1) # [batch_size, 
num_head_groups, groups, seqlen_kv] + attention = F.softmax(scores / scale, dim=-1) # [batch_size, num_head_groups, groups, seqlen_kv] - out = einsum(attention, value, - 'b g h s, b h s d -> b g h d') # [batch_size, num_head_groups, groups, dim] - out = rearrange(out, 'b g h d -> b (h g) d') # [batch_size, heads, dim] + out = einsum(attention, value, "b g h s, b h s d -> b g h d") # [batch_size, num_head_groups, groups, dim] + out = rearrange(out, "b g h d -> b (h g) d") # [batch_size, heads, dim] return out @@ -334,16 +335,12 @@ def flash_split_ref(Q, K, V, mask): seqlen_kv = K.size(1) num_head_groups = nheads // groups - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) acc_s = torch.empty((batch, num_head_groups, groups, block_N), device="cuda", dtype=torch.float) - acc_s_cast = torch.empty((batch, num_head_groups, groups, block_N), - device="cuda", - dtype=torch.float16) + acc_s_cast = torch.empty((batch, num_head_groups, groups, block_N), device="cuda", dtype=torch.float16) acc_o = torch.empty((batch, num_head_groups, groups, dim), device="cuda", dtype=torch.float) scores_max = torch.empty((batch, num_head_groups, groups), device="cuda", dtype=torch.float) - scores_max_prev = torch.empty((batch, num_head_groups, groups), - device="cuda", - dtype=torch.float) + scores_max_prev = torch.empty((batch, num_head_groups, groups), device="cuda", dtype=torch.float) scores_scale = torch.empty((batch, num_head_groups, groups), device="cuda", dtype=torch.float) scores_sum = torch.empty((batch, num_head_groups, groups), device="cuda", dtype=torch.float) logsum = torch.empty((batch, num_head_groups, groups), device="cuda", dtype=torch.float) @@ -351,25 +348,25 @@ def flash_split_ref(Q, K, V, mask): glogsum = torch.empty((num_split, batch, nheads), device="cuda", dtype=torch.float) Q_ = Q * scale - Q_ = rearrange(Q_, 'b (h g) d -> b g h d', g=num_head_groups) + Q_ = rearrange(Q_, "b (h g) d -> b g h d", g=num_head_groups) for ks in range(num_split): acc_o.fill_(0) logsum.fill_(0) - scores_max.fill_(float('-inf')) - scores_max_prev.fill_(float('-inf')) + scores_max.fill_(float("-inf")) + scores_max_prev.fill_(float("-inf")) for i in range(int((seqlen_kv // num_split) / block_N)): acc_s.fill_(0) - acc_s = torch.einsum('bghd,bkhd->bghk', Q_, - K[:, (seqlen_kv // num_split) * ks + - i * block_N:(seqlen_kv // num_split) * ks + - (i + 1) * block_N, :, :]) # [batch, nheads, block_N] + acc_s = torch.einsum( + "bghd,bkhd->bghk", + Q_, + K[:, (seqlen_kv // num_split) * ks + i * block_N : (seqlen_kv // num_split) * ks + (i + 1) * block_N, :, :], + ) # [batch, nheads, block_N] if mask is not None: - mask_local = mask[:, (seqlen_kv // num_split) * ks + - i * block_N:(seqlen_kv // num_split) * ks + (i + 1) * block_N, :] - mask_local = rearrange(mask_local, 'b s h -> b h s') + mask_local = mask[:, (seqlen_kv // num_split) * ks + i * block_N : (seqlen_kv // num_split) * ks + (i + 1) * block_N, :] + mask_local = rearrange(mask_local, "b s h -> b h s") mask_local = mask_local.unsqueeze(1) - acc_s = acc_s.masked_fill(mask_local == 0, float('-inf')) + acc_s = acc_s.masked_fill(mask_local == 0, float("-inf")) scores_max_prev = scores_max scores_max = acc_s.max(dim=-1, keepdim=False).values # [batch, nheads] scores_scale = torch.exp2(scores_max_prev - scores_max) # [batch, nheads] @@ -377,15 +374,16 @@ def flash_split_ref(Q, K, V, mask): acc_s = torch.exp2(acc_s - scores_max[:, :, :, None]) acc_s_cast = acc_s.to(torch.float16) # [batch, nheads, block_N] acc_o += 
torch.einsum( - 'bghk,bkhd->bghd', acc_s_cast, - V[:, (seqlen_kv // num_split) * ks + i * block_N:(seqlen_kv // num_split) * ks + - (i + 1) * block_N, :, :]) + "bghk,bkhd->bghd", + acc_s_cast, + V[:, (seqlen_kv // num_split) * ks + i * block_N : (seqlen_kv // num_split) * ks + (i + 1) * block_N, :, :], + ) scores_sum = acc_s.sum(dim=-1, keepdim=False) logsum = logsum * scores_scale + scores_sum - acc_o_out = rearrange(acc_o, 'b g h d->b (h g) d') - logsum_out = rearrange(logsum, 'b g h->b (h g)') + acc_o_out = rearrange(acc_o, "b g h d->b (h g) d") + logsum_out = rearrange(logsum, "b g h->b (h g)") acc_o_out /= logsum_out[:, :, None] - logsum_out = torch.log2(logsum_out) + rearrange(scores_max, 'b g h->b (h g)') + logsum_out = torch.log2(logsum_out) + rearrange(scores_max, "b g h->b (h g)") gacc_o[ks, :, :, :] = acc_o_out glogsum[ks, :, :] = logsum_out @@ -421,7 +419,7 @@ def calc_sim(x, y, name="tensor"): x, y = x.data.double(), y.data.double() denominator = (x * x + y * y).sum() if denominator == 0: - print_red_warning(f'{name} all zero') + print_red_warning(f"{name} all zero") return 1 sim = 2 * (x * y).sum() / denominator return sim @@ -429,28 +427,23 @@ def calc_sim(x, y, name="tensor"): def assert_similar(x, y, eps=1e-2, name="tensor", assert_=False, print_=True): sim = calc_sim(x, y, name) - diff = 1. - sim + diff = 1.0 - sim if not (0 <= diff <= eps): - print_red_warning(f'{name} Error: {diff}') + print_red_warning(f"{name} Error: {diff}") if assert_: - raise AssertionError(f'{name} Error: {diff}') + raise AssertionError(f"{name} Error: {diff}") else: if print_: - print(f'passed: {name} diff={diff}') + print(f"passed: {name} diff={diff}") -def main(batch: int = 1, - heads: int = 32, - groups: int = 8, - kv_seqlen: int = 8192, - dim: int = 128, - tune: bool = False): +def main(batch: int = 1, heads: int = 32, groups: int = 8, kv_seqlen: int = 8192, dim: int = 128, tune: bool = False): batch, heads, groups, kv_seqlen, dim = batch, heads, groups, kv_seqlen, dim qk_flops = 2 * batch * heads * kv_seqlen * dim pv_flops = 2 * batch * heads * kv_seqlen * dim total_flops = qk_flops + pv_flops - if (not tune): + if not tune: config, sm_version = get_heuristic_config() kernel = flashattn(batch, heads, groups, kv_seqlen, dim, **config) profiler = kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Auto) @@ -470,7 +463,7 @@ def main(batch: int = 1, print(o_ref) assert_similar(o, o_ref, name="o_ref") - assert_similar(o_ref_split, o_ref, name="o_ref_split") + assert_similar(o, o_ref_split, name="o_ref_split") print("All checks pass.") latency = profiler.do_bench(ref_program, warmup=500) @@ -492,11 +485,11 @@ def main(batch: int = 1, if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=1, help='batch size') - parser.add_argument('--heads', type=int, default=32, help='heads') - parser.add_argument('--groups', type=int, default=8, help='groups') - parser.add_argument('--kv_seqlen', type=int, default=8192, help='kv sequence length') - parser.add_argument('--dim', type=int, default=128, help='dim') - parser.add_argument('--tune', action='store_true', help='tune configs') + parser.add_argument("--batch", type=int, default=1, help="batch size") + parser.add_argument("--heads", type=int, default=32, help="heads") + parser.add_argument("--groups", type=int, default=8, help="groups") + parser.add_argument("--kv_seqlen", type=int, default=8192, help="kv sequence length") + parser.add_argument("--dim", type=int, default=128, help="dim") 
+ parser.add_argument("--tune", action="store_true", help="tune configs") args = parser.parse_args() main(args.batch, args.heads, args.groups, args.kv_seqlen, args.dim, args.tune) diff --git a/examples/flash_decoding/example_gqa_decode_varlen_logits.py b/examples/flash_decoding/example_gqa_decode_varlen_logits.py index 16924ebe89bc1bfe24552997cb9053eae93e6fcd..ef3d8baed6fce23f474597d29aeb365e611e629e 100644 --- a/examples/flash_decoding/example_gqa_decode_varlen_logits.py +++ b/examples/flash_decoding/example_gqa_decode_varlen_logits.py @@ -19,8 +19,7 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: batch, num_key_value_heads, slen, head_dim = hidden_states.shape if n_rep == 1: return hidden_states - hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, - head_dim) + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) @@ -74,14 +73,9 @@ def _fwd_inner( return m_i, l_i, acc - @triton.autotune( - configs=[ - triton.Config({}, num_warps=num_warps, num_stages=num_stages) - for num_warps in [4, 8]\ - for num_stages in [2, 4]\ - ], - key=['gqa_group_size', 'BLOCK_N', 'BLOCK_D', 'BLOCK_H'], + configs=[triton.Config({}, num_warps=num_warps, num_stages=num_stages) for num_warps in [4, 8] for num_stages in [2, 4]], + key=["gqa_group_size", "BLOCK_N", "BLOCK_D", "BLOCK_H"], ) @triton.jit def _fwd_kernel_varlen( @@ -107,13 +101,12 @@ def _fwd_kernel_varlen( stride_od, stride_sb, stride_sh, - stride_sn, #bmask shape [b, q_h, seq/BLOCK_N] + stride_sn, # bmask shape [b, q_h, seq/BLOCK_N] gqa_group_size: tl.constexpr, BLOCK_H: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_D: tl.constexpr, ): - off_z = tl.program_id(0) off_h_for_kv = tl.program_id(1) off_h_q = off_h_for_kv * gqa_group_size @@ -134,8 +127,7 @@ def _fwd_kernel_varlen( S_ptrs = S + off_z * stride_sb + off_h_q * stride_sh mask_h = offs_h < gqa_group_size - q = tl.load( - Q_ptrs + offs_d[None, :] * stride_qd + offs_h[:, None] * stride_qh, mask=mask_h[:, None]) + q = tl.load(Q_ptrs + offs_d[None, :] * stride_qd + offs_h[:, None] * stride_qh, mask=mask_h[:, None]) if s_aux is not None: sink = tl.load(s_aux + off_h_q + offs_h, mask=mask_h).to(tl.float32) @@ -189,14 +181,12 @@ def _fwd_kernel_varlen( acc = acc.to(O.dtype.element_ty) - tl.store( - O_ptrs + offs_h[:, None] * stride_oh + offs_d[None, :] * stride_od, - acc, - mask=mask_h[:, None]) + tl.store(O_ptrs + offs_h[:, None] * stride_oh + offs_d[None, :] * stride_od, acc, mask=mask_h[:, None]) def get_configs(): import itertools + block_N = [64, 128] block_H = [64] num_split = [1] @@ -204,38 +194,23 @@ def get_configs(): threads = [128] _configs = list(itertools.product(block_N, block_H, num_split, num_stages, threads)) - configs = [{ - 'block_N': c[0], - 'block_H': c[1], - 'num_split': c[2], - 'num_stages': c[3], - 'threads': c[4] - } for c in _configs] + configs = [{"block_N": c[0], "block_H": c[1], "num_split": c[2], "num_stages": c[3], "threads": c[4]} for c in _configs] return configs @autotune(configs=get_configs(), warmup=10, rep=10) @tilelang.jit(out_idx=[-2, -1], debug_root_path="./examples/flash_decoding") -def flashattn(batch, - heads, - k_heads, - max_seqlen_kv, - total_seqlen_k, - dim, - has_sink, - block_N=128, - block_H=64, - num_split=1, - num_stages=1, - threads=128): - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) +def flashattn( + batch, heads, k_heads, max_seqlen_kv, 
total_seqlen_k, dim, has_sink, block_N=128, block_H=64, num_split=1, num_stages=1, threads=128 +): + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) shape_q = [batch, heads, dim] shape_k = [total_seqlen_k, k_heads, dim] shape_v = [total_seqlen_k, k_heads, dim] shape_o = [batch, heads, dim] shape_s = [batch, heads, math.ceil(max_seqlen_kv / block_N)] - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 kv_group_num = heads // k_heads valid_block_H = min(block_H, kv_group_num) @@ -243,13 +218,13 @@ def flashattn(batch, @T.macro def flash_attn( - Q: T.Tensor(shape_q, dtype), - K: T.Tensor(shape_k, dtype), - V: T.Tensor(shape_v, dtype), - cu_seqlens_k: T.Tensor([batch + 1], "int32"), - s_aux: T.Tensor([heads], "float32"), - Output: T.Tensor([batch, heads, dim], dtype), - S: T.Tensor(shape_s, dtype), + Q: T.Tensor(shape_q, dtype), + K: T.Tensor(shape_k, dtype), + V: T.Tensor(shape_v, dtype), + cu_seqlens_k: T.Tensor([batch + 1], T.int32), + s_aux: T.Tensor([heads], T.float32), + Output: T.Tensor([batch, heads, dim], dtype), + S: T.Tensor(shape_s, dtype), ): with T.Kernel(batch, heads // valid_block_H, num_split, threads=threads) as (bx, by, bz): Q_shared = T.alloc_shared([block_H, dim], dtype) @@ -266,15 +241,17 @@ def flashattn(batch, logsum = T.alloc_fragment([block_H], accum_dtype) S_shared = T.alloc_shared([block_H, math.ceil(max_seqlen_kv / block_N)], dtype) # S_fragment = T.alloc_fragment([block_H, math.ceil(max_seqlen_kv / block_N)], accum_dtype) - s_aux_shared = T.alloc_shared([block_H], "float32") - - T.annotate_layout({ - # Q_shared: tilelang.layout.make_swizzled_layout(Q_shared), - # K_shared: tilelang.layout.make_swizzled_layout(K_shared), - # V_shared: tilelang.layout.make_swizzled_layout(V_shared), - # O_shared: tilelang.layout.make_swizzled_layout(O_shared), - # S_shared: tilelang.layout.make_swizzled_layout(S_shared), - }) + s_aux_shared = T.alloc_shared([block_H], T.float32) + + T.annotate_layout( + { + # Q_shared: tilelang.layout.make_swizzled_layout(Q_shared), + # K_shared: tilelang.layout.make_swizzled_layout(K_shared), + # V_shared: tilelang.layout.make_swizzled_layout(V_shared), + # O_shared: tilelang.layout.make_swizzled_layout(O_shared), + # S_shared: tilelang.layout.make_swizzled_layout(S_shared), + } + ) bid = bx hid = by @@ -284,7 +261,7 @@ def flashattn(batch, cur_end_k = cu_seqlens_k[bid + 1] cur_seqlen_k = cur_end_k - cur_start_k - T.copy(Q[bid, hid * valid_block_H:hid * valid_block_H + block_H, :], Q_shared) + T.copy(Q[bid, hid * valid_block_H : hid * valid_block_H + block_H, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) @@ -292,15 +269,13 @@ def flashattn(batch, # loop_range = T.ceildiv((seqlen_kv // num_split), block_N) loop_range = T.ceildiv((cur_seqlen_k // num_split), block_N) for k in T.Pipelined(loop_range, num_stages=num_stages): - T.copy(K[cur_start_k + k * block_N:cur_start_k + (k + 1) * block_N, cur_kv_head, :], - K_shared) + T.copy(K[cur_start_k + k * block_N : cur_start_k + (k + 1) * block_N, cur_kv_head, :], K_shared) T.clear(acc_s) T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) for i, j in T.Parallel(block_H, block_N): # acc_s[i, j] = T.if_then_else(mask_local[j] != 0 and k * block_N + j < cur_seqlen_k, acc_s[i, j], # -T.infinity(accum_dtype)) - acc_s[i, j] = T.if_then_else(k * block_N + j < cur_seqlen_k, acc_s[i, j], - -T.infinity(accum_dtype)) + acc_s[i, j] = T.if_then_else(k * block_N + j < cur_seqlen_k, acc_s[i, j], 
-T.infinity(accum_dtype)) T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, scores_max, dim=1, clear=False) @@ -320,12 +295,11 @@ def flashattn(batch, T.copy(acc_s, acc_s_cast) for i, j in T.Parallel(block_H, dim): acc_o[i, j] *= scores_scale[i] - T.copy(V[cur_start_k + k * block_N:cur_start_k + (k + 1) * block_N, cur_kv_head, :], - V_shared) + T.copy(V[cur_start_k + k * block_N : cur_start_k + (k + 1) * block_N, cur_kv_head, :], V_shared) T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) if has_sink: - T.copy(s_aux[hid * valid_block_H:hid * valid_block_H + block_H], s_aux_shared) + T.copy(s_aux[hid * valid_block_H : hid * valid_block_H + block_H], s_aux_shared) for i in T.Parallel(block_H): logsum[i] += s_aux_shared[i] for i, j in T.Parallel(block_H, dim): @@ -338,20 +312,19 @@ def flashattn(batch, for i in T.Parallel(block_H): logsum[i] = T.log2(logsum[i]) + scores_max[i] * scale T.copy(acc_o[:valid_block_H, :], O_shared) - T.copy(O_shared, Output[bid, hid * valid_block_H:(hid + 1) * valid_block_H, :]) + T.copy(O_shared, Output[bid, hid * valid_block_H : (hid + 1) * valid_block_H, :]) # T.copy(S_fragment, S_shared) - T.copy(S_shared[:valid_block_H, :], S[bid, - hid * valid_block_H:(hid + 1) * valid_block_H, :]) + T.copy(S_shared[:valid_block_H, :], S[bid, hid * valid_block_H : (hid + 1) * valid_block_H, :]) @T.prim_func def flashattn_gqa_decode_no_split( - Q: T.Tensor(shape_q, dtype), - K: T.Tensor(shape_k, dtype), - V: T.Tensor(shape_v, dtype), - cu_seqlens_k: T.Tensor([batch + 1], "int32"), - s_aux: T.Tensor([heads], "float32"), - Output: T.Tensor(shape_o, dtype), - S: T.Tensor(shape_s, dtype), + Q: T.Tensor(shape_q, dtype), + K: T.Tensor(shape_k, dtype), + V: T.Tensor(shape_v, dtype), + cu_seqlens_k: T.Tensor([batch + 1], T.int32), + s_aux: T.Tensor([heads], T.float32), + Output: T.Tensor(shape_o, dtype), + S: T.Tensor(shape_s, dtype), ): flash_attn(Q, K, V, cu_seqlens_k, s_aux, Output, S) @@ -388,9 +361,7 @@ def flash_attn_with_attn_pool_decode_tilelang( gqa_group_size = q_h // k_h O_tl = torch.zeros_like(Q) - S_tl = torch.zeros((batch, q_h, math.ceil(real_max_k_seqlen / block_size)), - dtype=Q.dtype, - device=Q.device) + S_tl = torch.zeros((batch, q_h, math.ceil(real_max_k_seqlen / block_size)), dtype=Q.dtype, device=Q.device) O_tl, S_tl = tl_kernel(Q, K, V, cu_seqlens_k, s_aux) if use_per_kv_head_sparse_index: @@ -433,9 +404,7 @@ def flash_attn_with_attn_pool_decode( BLOCK_H = 64 O = torch.zeros_like(Q) - S = torch.zeros((batch, q_h, math.ceil(max_seqlen_k / block_size)), - dtype=Q.dtype, - device=Q.device) + S = torch.zeros((batch, q_h, math.ceil(max_seqlen_k / block_size)), dtype=Q.dtype, device=Q.device) def grid(META): return (batch, k_h) @@ -480,18 +449,18 @@ def test_equal_seqlen_decode_main(args): real_max_k_seqlen = args.k_seqlen head_size = args.head_size block_size = args.block_size - dtype = torch.bfloat16 if args.dtype == "bfloat16" else torch.float16 + dtype = torch.bfloat16 if args.dtype == T.bfloat16 else torch.float16 # For decode, query is just 1 token per batch - q = torch.randn(batch_size, q_heads, head_size, device='cuda', dtype=dtype) - k = torch.randn(batch_size, kv_heads, k_seqlen, head_size, device='cuda', dtype=dtype) - v = torch.randn(batch_size, kv_heads, k_seqlen, head_size, device='cuda', dtype=dtype) + q = torch.randn(batch_size, q_heads, head_size, device="cuda", dtype=dtype) + k = torch.randn(batch_size, kv_heads, k_seqlen, head_size, device="cuda", dtype=dtype) + v = 
torch.randn(batch_size, kv_heads, k_seqlen, head_size, device="cuda", dtype=dtype) softmax_scale = 1.0 / math.sqrt(head_size) # Generate sink values if needed sink = None if args.test_sink: - sink = torch.randn(q_heads, device='cuda', dtype=torch.float32) * 0.1 # Small sink values + sink = torch.randn(q_heads, device="cuda", dtype=torch.float32) * 0.1 # Small sink values print(f"Using sink attention with sink values: {sink}") # Convert to varlen format for K, V @@ -499,8 +468,7 @@ def test_equal_seqlen_decode_main(args): v_varlen = v.transpose(1, 2).reshape(batch_size * k_seqlen, kv_heads, head_size) # Generate cumulative sequence lengths - cu_seqlens_k = torch.arange( - 0, (batch_size + 1) * k_seqlen, k_seqlen, device='cuda', dtype=torch.int32) + cu_seqlens_k = torch.arange(0, (batch_size + 1) * k_seqlen, k_seqlen, device="cuda", dtype=torch.int32) max_seqlen_k = k_seqlen print(f"q shape: {q.shape}") @@ -510,8 +478,7 @@ def test_equal_seqlen_decode_main(args): num_tokens, q_h, head_size = q.shape batch = cu_seqlens_k.size(0) - 1 k_h = k_varlen.size(1) - tl_kernel = flashattn(batch, q_h, k_h, args.k_seqlen, cu_seqlens_k[-1].item(), head_size, - args.test_sink) + tl_kernel = flashattn(batch, q_h, k_h, args.k_seqlen, cu_seqlens_k[-1].item(), head_size, args.test_sink) # Test our decode kernel O_triton, S_triton = flash_attn_with_attn_pool_decode( @@ -524,7 +491,8 @@ def test_equal_seqlen_decode_main(args): args.num_split, softmax_scale, s_aux=sink, - block_size=block_size) + block_size=block_size, + ) O_tilelang, S_tilelang = flash_attn_with_attn_pool_decode_tilelang( q, k_varlen, @@ -539,9 +507,7 @@ def test_equal_seqlen_decode_main(args): tl_kernel=tl_kernel, ) for i in range(batch_size): - S_tilelang[i, :, - math.ceil((cu_seqlens_k[i + 1].item() - cu_seqlens_k[i].item()) / - block_size):] = 0 + S_tilelang[i, :, math.ceil((cu_seqlens_k[i + 1].item() - cu_seqlens_k[i].item()) / block_size) :] = 0 # Compute torch reference q_expanded = q.unsqueeze(2) # [b, q_heads, 1, head_size] @@ -550,14 +516,12 @@ def test_equal_seqlen_decode_main(args): if sink is None: # Standard scaled dot-product attention - logits = torch.matmul(q_expanded, k_repeat.transpose( - -2, -1)) * softmax_scale # [batch, q_heads, 1, seqlen_k] + logits = torch.matmul(q_expanded, k_repeat.transpose(-2, -1)) * softmax_scale # [batch, q_heads, 1, seqlen_k] attn_weights = torch.softmax(logits, dim=-1) O_torch = torch.matmul(attn_weights, v_repeat).squeeze(2) # [batch, q_heads, head_size] else: # s_aux attention - logits = torch.matmul(q_expanded, k_repeat.transpose( - -2, -1)) * softmax_scale # [batch, q_heads, 1, seqlen_k] + logits = torch.matmul(q_expanded, k_repeat.transpose(-2, -1)) * softmax_scale # [batch, q_heads, 1, seqlen_k] sink_expanded = sink.view(1, q_heads, 1, 1) # [1, q_heads, 1, 1] logits_max = torch.max(logits, dim=-1, keepdim=True).values @@ -566,15 +530,15 @@ def test_equal_seqlen_decode_main(args): unnormalized_scores = torch.exp(logits - logits_or_sinks_max) normalizer = unnormalized_scores.sum(dim=-1, keepdim=True) + sinks attn_weights = unnormalized_scores / normalizer - O_torch = torch.matmul(attn_weights.to(v_repeat.dtype), - v_repeat).squeeze(2) # [batch, q_heads, head_size] + O_torch = torch.matmul(attn_weights.to(v_repeat.dtype), v_repeat).squeeze(2) # [batch, q_heads, head_size] # Compute attention score pooling attn_score_pooled = torch.max_pool2d( attn_weights.squeeze(2), # [b, q_heads, k_seqlen] kernel_size=(q_heads, block_size), stride=(q_heads, block_size), - ceil_mode=True).to(torch.float16) + 
ceil_mode=True, + ).to(torch.float16) print("S_tilelang", S_tilelang) print("attn_score_pooled", attn_score_pooled) @@ -588,15 +552,10 @@ def test_equal_seqlen_decode_main(args): print(f"Max difference in S: {max_diff_s.item()}") print(f"Max difference in O_tilelang: {max_diff_o_tilelang.item()}") print(f"Max difference in S_tilelang: {max_diff_s_tilelang.item()}") - assert torch.allclose( - O_triton, O_torch, atol=1e-2, rtol=1e-2), f"Output mismatch: {max_diff_o.item()}" - assert torch.allclose( - S_triton, attn_score_pooled, atol=1e-2, rtol=1e-2), f"Score mismatch: {max_diff_s.item()}" - assert torch.allclose( - O_tilelang, O_torch, atol=1e-2, rtol=1e-2), f"Output mismatch: {max_diff_o_tilelang.item()}" - assert torch.allclose( - S_tilelang, attn_score_pooled, atol=1e-2, - rtol=1e-2), f"Score mismatch: {max_diff_s_tilelang.item()}" + assert torch.allclose(O_triton, O_torch, atol=1e-2, rtol=1e-2), f"Output mismatch: {max_diff_o.item()}" + assert torch.allclose(S_triton, attn_score_pooled, atol=1e-2, rtol=1e-2), f"Score mismatch: {max_diff_s.item()}" + assert torch.allclose(O_tilelang, O_torch, atol=1e-2, rtol=1e-2), f"Output mismatch: {max_diff_o_tilelang.item()}" + assert torch.allclose(S_tilelang, attn_score_pooled, atol=1e-2, rtol=1e-2), f"Score mismatch: {max_diff_s_tilelang.item()}" print("✅ All tests passed!") @@ -609,14 +568,14 @@ def test_varlen_decode_main(args): real_max_k_seqlen = args.k_seqlen head_size = args.head_size block_size = args.block_size - dtype = torch.bfloat16 if args.dtype == "bfloat16" else torch.float16 + dtype = torch.bfloat16 if args.dtype == T.bfloat16 else torch.float16 print(f"Testing decode kernel with variable sequence lengths (max_k_seqlen={max_k_seqlen})") # Generate sink values if needed sink = None if args.test_sink: - sink = torch.randn(q_heads, device='cuda', dtype=torch.float32) * 0.1 # Small sink values + sink = torch.randn(q_heads, device="cuda", dtype=torch.float32) * 0.1 # Small sink values print(f"Using sink attention with sink values: {sink}") # Generate variable length k sequences @@ -624,7 +583,7 @@ def test_varlen_decode_main(args): print(f"k_seqlens: {k_seqlens}") # Generate cumulative sequence lengths for k - cu_seqlens_k = torch.zeros(batch_size + 1, device='cuda', dtype=torch.int32) + cu_seqlens_k = torch.zeros(batch_size + 1, device="cuda", dtype=torch.int32) total_k_tokens = 0 for i in range(batch_size): cu_seqlens_k[i] = total_k_tokens @@ -634,9 +593,9 @@ def test_varlen_decode_main(args): print(f"cu_seqlens_k: {cu_seqlens_k}") # Generate tensors - Q is [batch_size, q_heads, head_size] for decode - q_decode = torch.randn(batch_size, q_heads, head_size, device='cuda', dtype=dtype) - k_varlen = torch.randn(total_k_tokens, kv_heads, head_size, device='cuda', dtype=dtype) - v_varlen = torch.randn(total_k_tokens, kv_heads, head_size, device='cuda', dtype=dtype) + q_decode = torch.randn(batch_size, q_heads, head_size, device="cuda", dtype=dtype) + k_varlen = torch.randn(total_k_tokens, kv_heads, head_size, device="cuda", dtype=dtype) + v_varlen = torch.randn(total_k_tokens, kv_heads, head_size, device="cuda", dtype=dtype) softmax_scale = 1.0 / math.sqrt(head_size) max_seqlen_k = int(k_seqlens.max()) @@ -649,8 +608,7 @@ def test_varlen_decode_main(args): num_tokens, q_h, head_size = q_decode.shape batch = cu_seqlens_k.size(0) - 1 k_h = k_varlen.size(1) - tl_kernel = flashattn(batch, q_h, k_h, args.k_seqlen, cu_seqlens_k[-1].item(), head_size, - args.test_sink) + tl_kernel = flashattn(batch, q_h, k_h, args.k_seqlen, 
cu_seqlens_k[-1].item(), head_size, args.test_sink) # Test our decode kernel O_triton, S_triton = flash_attn_with_attn_pool_decode( @@ -663,7 +621,8 @@ def test_varlen_decode_main(args): args.num_split, softmax_scale, s_aux=sink, - block_size=block_size) + block_size=block_size, + ) O_tilelang, S_tilelang = flash_attn_with_attn_pool_decode_tilelang( q_decode, k_varlen, @@ -678,9 +637,7 @@ def test_varlen_decode_main(args): tl_kernel=tl_kernel, ) for i in range(batch_size): - S_tilelang[i, :, - math.ceil((cu_seqlens_k[i + 1].item() - cu_seqlens_k[i].item()) / - block_size):] = 0 + S_tilelang[i, :, math.ceil((cu_seqlens_k[i + 1].item() - cu_seqlens_k[i].item()) / block_size) :] = 0 # Create torch reference - pad tensors for comparison k_padded_list = [] @@ -694,8 +651,8 @@ def test_varlen_decode_main(args): k_end = cu_seqlens_k[i + 1] # Pad to max_seqlen_k - k_padded = torch.zeros(max_seqlen_k, kv_heads, head_size, device='cuda', dtype=dtype) - v_padded = torch.zeros(max_seqlen_k, kv_heads, head_size, device='cuda', dtype=dtype) + k_padded = torch.zeros(max_seqlen_k, kv_heads, head_size, device="cuda", dtype=dtype) + v_padded = torch.zeros(max_seqlen_k, kv_heads, head_size, device="cuda", dtype=dtype) k_padded[:actual_k_len] = k_varlen[k_start:k_end] v_padded[:actual_k_len] = v_varlen[k_start:k_end] @@ -704,10 +661,8 @@ def test_varlen_decode_main(args): v_padded_list.append(v_padded) # Stack to create batched tensors [b, max_seqlen, kv_heads, head_size] - k_padded_batched = torch.stack( - k_padded_list, dim=0).transpose(1, 2) # [b, kv_heads, max_seqlen, head_size] - v_padded_batched = torch.stack( - v_padded_list, dim=0).transpose(1, 2) # [b, kv_heads, max_seqlen, head_size] + k_padded_batched = torch.stack(k_padded_list, dim=0).transpose(1, 2) # [b, kv_heads, max_seqlen, head_size] + v_padded_batched = torch.stack(v_padded_list, dim=0).transpose(1, 2) # [b, kv_heads, max_seqlen, head_size] # Expand q to match kv heads: [b, q_heads, 1, head_size] q_expanded = q_decode.unsqueeze(2) # [b, q_heads, 1, head_size] @@ -717,20 +672,17 @@ def test_varlen_decode_main(args): print(f"v_padded_batched shape: {v_padded_batched.shape}") # Compute torch reference - k_repeat = repeat_kv(k_padded_batched, - q_heads // kv_heads) # [b, q_heads, max_seqlen, head_size] - v_repeat = repeat_kv(v_padded_batched, - q_heads // kv_heads) # [b, q_heads, max_seqlen, head_size] + k_repeat = repeat_kv(k_padded_batched, q_heads // kv_heads) # [b, q_heads, max_seqlen, head_size] + v_repeat = repeat_kv(v_padded_batched, q_heads // kv_heads) # [b, q_heads, max_seqlen, head_size] if sink is None: # Standard attention computation: [b, q_heads, 1, head_size] @ [b, q_heads, head_size, max_seqlen] - attn_score = torch.matmul(q_expanded, k_repeat.transpose( - -2, -1)) * softmax_scale # [b, q_heads, 1, max_seqlen] + attn_score = torch.matmul(q_expanded, k_repeat.transpose(-2, -1)) * softmax_scale # [b, q_heads, 1, max_seqlen] # Apply sequence length masking for i in range(batch_size): actual_k_len = k_seqlens[i] - attn_score[i, :, :, actual_k_len:] = float('-inf') + attn_score[i, :, :, actual_k_len:] = float("-inf") attn_weights = attn_score.softmax(dim=-1) # [b, q_heads, 1, max_seqlen] @@ -743,13 +695,12 @@ def test_varlen_decode_main(args): O_torch = torch.matmul(attn_weights, v_repeat) # [b, q_heads, 1, head_size] else: # s_aux attention - logits = torch.matmul(q_expanded, k_repeat.transpose( - -2, -1)) * softmax_scale # [b, q_heads, 1, max_seqlen] + logits = torch.matmul(q_expanded, k_repeat.transpose(-2, -1)) * softmax_scale # 
[b, q_heads, 1, max_seqlen] # Apply sequence length masking for i in range(batch_size): actual_k_len = k_seqlens[i] - logits[i, :, :, actual_k_len:] = float('-inf') + logits[i, :, :, actual_k_len:] = float("-inf") sink_expanded = sink.view(1, q_heads, 1, 1) # [1, q_heads, 1, 1] logits_max = torch.max(logits, dim=-1, keepdim=True).values @@ -765,8 +716,7 @@ def test_varlen_decode_main(args): attn_weights[i, :, :, actual_k_len:] = 0.0 # Compute output: [b, q_heads, 1, max_seqlen] @ [b, q_heads, max_seqlen, head_size] - O_torch = torch.matmul(attn_weights.to(v_repeat.dtype), - v_repeat) # [b, q_heads, 1, head_size] + O_torch = torch.matmul(attn_weights.to(v_repeat.dtype), v_repeat) # [b, q_heads, 1, head_size] O_torch = O_torch.squeeze(2) # [b, q_heads, head_size] @@ -775,7 +725,8 @@ def test_varlen_decode_main(args): attn_weights.squeeze(2), # [b, q_heads, max_seqlen] kernel_size=(q_heads, block_size), stride=(q_heads, block_size), - ceil_mode=True).to(dtype=torch.float16) # [b, 1, ceil(max_seqlen/block_size)] + ceil_mode=True, + ).to(dtype=torch.float16) # [b, 1, ceil(max_seqlen/block_size)] print(f"O_triton shape: {O_triton.shape}") print(f"O_tilelang shape: {O_tilelang.shape}") @@ -791,22 +742,16 @@ def test_varlen_decode_main(args): print(f"Max difference in O_tilelang: {max_diff_o_tl.item()}") max_diff_s = torch.max(torch.abs(S_triton - attn_score_pooled)) - max_diff_s_tl = torch.max( - torch.abs(S_tilelang[:, :, :math.ceil(max_seqlen_k / block_size)] - attn_score_pooled)) + max_diff_s_tl = torch.max(torch.abs(S_tilelang[:, :, : math.ceil(max_seqlen_k / block_size)] - attn_score_pooled)) print(f"Max difference in S: {max_diff_s.item()}") print(f"Max difference in S_tilelang: {max_diff_s_tl.item()}") - assert torch.allclose( - O_triton, O_torch, atol=1e-2, rtol=1e-2), f"Output mismatch: {max_diff_o.item()}" - assert torch.allclose( - S_triton, attn_score_pooled, atol=1e-2, rtol=1e-2), f"Score mismatch: {max_diff_s.item()}" - assert torch.allclose( - O_tilelang, O_torch, atol=1e-2, rtol=1e-2), f"Output mismatch: {max_diff_o_tl.item()}" - assert torch.allclose( - S_tilelang[:, :, :math.ceil(max_seqlen_k / block_size)], - attn_score_pooled, - atol=1e-2, - rtol=1e-2), f"Score mismatch: {max_diff_s_tl.item()}" + assert torch.allclose(O_triton, O_torch, atol=1e-2, rtol=1e-2), f"Output mismatch: {max_diff_o.item()}" + assert torch.allclose(S_triton, attn_score_pooled, atol=1e-2, rtol=1e-2), f"Score mismatch: {max_diff_s.item()}" + assert torch.allclose(O_tilelang, O_torch, atol=1e-2, rtol=1e-2), f"Output mismatch: {max_diff_o_tl.item()}" + assert torch.allclose(S_tilelang[:, :, : math.ceil(max_seqlen_k / block_size)], attn_score_pooled, atol=1e-2, rtol=1e-2), ( + f"Score mismatch: {max_diff_s_tl.item()}" + ) print("✅ All tests passed!") @@ -844,7 +789,7 @@ def speed_benchmark_decode_comparison(args): max_k_seqlen = args.k_seqlen head_size = args.head_size block_size = args.block_size - dtype = torch.bfloat16 if args.dtype == "bfloat16" else torch.float16 + dtype = torch.bfloat16 if args.dtype == T.bfloat16 else torch.float16 print("\n=== Decode Speed Benchmark Comparison ===") print("Configuration:") @@ -865,7 +810,7 @@ def speed_benchmark_decode_comparison(args): k_seqlens = torch.full((batch_size,), max_k_seqlen, dtype=int) # Generate cumulative sequence lengths for k - cu_seqlens_k = torch.zeros(batch_size + 1, device='cuda', dtype=torch.int32) + cu_seqlens_k = torch.zeros(batch_size + 1, device="cuda", dtype=torch.int32) total_k_tokens = 0 for i in range(batch_size): cu_seqlens_k[i] = 
total_k_tokens @@ -873,9 +818,9 @@ def speed_benchmark_decode_comparison(args): cu_seqlens_k[batch_size] = total_k_tokens # Generate tensors - q_decode = torch.randn(batch_size, q_heads, head_size, device='cuda', dtype=dtype) - k_varlen = torch.randn(total_k_tokens, kv_heads, head_size, device='cuda', dtype=dtype) - v_varlen = torch.randn(total_k_tokens, kv_heads, head_size, device='cuda', dtype=dtype) + q_decode = torch.randn(batch_size, q_heads, head_size, device="cuda", dtype=dtype) + k_varlen = torch.randn(total_k_tokens, kv_heads, head_size, device="cuda", dtype=dtype) + v_varlen = torch.randn(total_k_tokens, kv_heads, head_size, device="cuda", dtype=dtype) softmax_scale = 1.0 / math.sqrt(head_size) max_seqlen_k = int(k_seqlens.max()) @@ -883,7 +828,7 @@ def speed_benchmark_decode_comparison(args): # Generate sink values if needed sink = None if args.test_sink: - sink = torch.randn(q_heads, device='cuda', dtype=torch.float32) * 0.1 # Small sink values + sink = torch.randn(q_heads, device="cuda", dtype=torch.float32) * 0.1 # Small sink values print(" Using sink attention with sink values") print("Setup complete:") @@ -896,8 +841,7 @@ def speed_benchmark_decode_comparison(args): num_tokens, q_h, head_size = q_decode.shape batch = cu_seqlens_k.size(0) - 1 k_h = k_varlen.size(1) - tl_kernel = flashattn(batch, q_h, k_h, args.k_seqlen, cu_seqlens_k[-1].item(), head_size, - args.test_sink) + tl_kernel = flashattn(batch, q_h, k_h, args.k_seqlen, cu_seqlens_k[-1].item(), head_size, args.test_sink) # Benchmark print("⚡ Benchmarking Tilelang kernel (100 iterations)...") @@ -920,36 +864,41 @@ def speed_benchmark_decode_comparison(args): # Benchmark print("⚡ Benchmarking Triton kernel (100 iterations)...") - triton_time = do_bench(flash_attn_with_attn_pool_decode, q_decode, k_varlen, v_varlen, - cu_seqlens_k, max_seqlen_k, args.k_seqlen, 1, softmax_scale, sink, - block_size) + triton_time = do_bench( + flash_attn_with_attn_pool_decode, + q_decode, + k_varlen, + v_varlen, + cu_seqlens_k, + max_seqlen_k, + args.k_seqlen, + 1, + softmax_scale, + sink, + block_size, + ) print(f"Average decode kernel time Triton: {triton_time:.3f} ms") print(f"Speedup: {(triton_time / tilelang_time):.3f}") if __name__ == "__main__": - parser = argparse.ArgumentParser(description='Flash Attention Decode with Attention Pooling') - parser.add_argument('--batch_size', type=int, default=1, help='Batch size') - parser.add_argument('--q_heads', type=int, default=32, help='Number of query heads') - parser.add_argument('--kv_heads', type=int, default=8, help='Number of key-value heads') - parser.add_argument('--k_seqlen', type=int, default=8192, help='Key sequence length') - parser.add_argument( - '--head_size', type=int, default=128, choices=[64, 128, 256], help='Head dimension') - parser.add_argument('--block_size', type=int, default=64, help='Block size for computation') - parser.add_argument( - '--dtype', type=str, default='bfloat16', choices=['float16', 'bfloat16'], help='Data type') - parser.add_argument( - '--test_varlen', action='store_true', help='Test with truly variable sequence lengths') - parser.add_argument( - '--test_sink', action='store_true', help='Test with sink attention mechanism') - parser.add_argument('--benchmark', action='store_true', help='Run speed benchmark') - parser.add_argument( - '--num_split', type=int, default=1, choices=[1, 16], help='Number of splits') + parser = argparse.ArgumentParser(description="Flash Attention Decode with Attention Pooling") + parser.add_argument("--batch_size", type=int, 
default=1, help="Batch size") + parser.add_argument("--q_heads", type=int, default=32, help="Number of query heads") + parser.add_argument("--kv_heads", type=int, default=8, help="Number of key-value heads") + parser.add_argument("--k_seqlen", type=int, default=8192, help="Key sequence length") + parser.add_argument("--head_size", type=int, default=128, choices=[64, 128, 256], help="Head dimension") + parser.add_argument("--block_size", type=int, default=64, help="Block size for computation") + parser.add_argument("--dtype", type=str, default=T.bfloat16, choices=[T.float16, T.bfloat16], help="Data type") + parser.add_argument("--test_varlen", action="store_true", help="Test with truly variable sequence lengths") + parser.add_argument("--test_sink", action="store_true", help="Test with sink attention mechanism") + parser.add_argument("--benchmark", action="store_true", help="Run speed benchmark") + parser.add_argument("--num_split", type=int, default=1, choices=[1, 16], help="Number of splits") args = parser.parse_args() args.test_sink = True args.test_varlen = False - args.dtype = 'float16' + args.dtype = T.float16 args.num_split = 1 if args.benchmark: diff --git a/examples/flash_decoding/example_gqa_decode_varlen_logits_paged.py b/examples/flash_decoding/example_gqa_decode_varlen_logits_paged.py new file mode 100644 index 0000000000000000000000000000000000000000..0984e707531fc49e3f7c2130b1299b9826c4ea53 --- /dev/null +++ b/examples/flash_decoding/example_gqa_decode_varlen_logits_paged.py @@ -0,0 +1,679 @@ +import torch +import math +import argparse +import tilelang +import tilelang.language as T +from example_gqa_decode_varlen_logits import flash_attn_with_attn_pool_decode, repeat_kv, do_bench + +torch.manual_seed(0) + + +def get_configs(): + import itertools + + block_N = [64, 128] + block_H = [64] + num_split = [1] + num_stages = [1, 2, 3] + threads = [128] + _configs = list(itertools.product(block_N, block_H, num_split, num_stages, threads)) + + configs = [{"block_N": c[0], "block_H": c[1], "num_split": c[2], "num_stages": c[3], "threads": c[4]} for c in _configs] + return configs + + +# @autotune(configs=get_configs(), warmup=10, rep=10) +@tilelang.jit(out_idx=[-2, -1], debug_root_path="./examples/flash_decoding") +def flashattn( + batch, + heads, + k_heads, + max_seqlen_kv, + total_seqlen_k, + dim, + has_sink, + page_block_size, + block_N=128, + block_H=64, + num_split=1, + num_stages=1, + threads=128, +): + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) + shape_q = [batch, heads, dim] + shape_k = [total_seqlen_k, k_heads, dim] + shape_v = [total_seqlen_k, k_heads, dim] + shape_o = [batch, heads, dim] + shape_s = [batch, heads, math.ceil(max_seqlen_kv / block_N)] + dtype = T.float16 + accum_dtype = T.float32 + kv_group_num = heads // k_heads + assert page_block_size >= block_N and page_block_size % block_N == 0, ( + "page_block_size must be larger than block_N and a multiple of block_N" + ) + + valid_block_H = min(block_H, kv_group_num) + # TODO: check if max_seqlen_kv is correct for varlen case + + @T.macro + def flash_attn( + Q: T.Tensor(shape_q, dtype), + K: T.Tensor(shape_k, dtype), + V: T.Tensor(shape_v, dtype), + cu_seqlens_k: T.Tensor([batch + 1], T.int32), + s_aux: T.Tensor([heads], T.float32), + BLOCK_TABLE: T.Tensor([batch, math.ceil(max_seqlen_kv / block_N)], T.int32), + Output: T.Tensor([batch, heads, dim], dtype), + S: T.Tensor(shape_s, dtype), + ): + with T.Kernel(batch, heads // valid_block_H, num_split, threads=threads) as (bx, by, bz): + Q_shared = 
T.alloc_shared([block_H, dim], dtype) + K_shared = T.alloc_shared([block_N, dim], dtype) + V_shared = T.alloc_shared([block_N, dim], dtype) + O_shared = T.alloc_shared([valid_block_H, dim], dtype) + acc_s = T.alloc_fragment([block_H, block_N], accum_dtype) + acc_s_cast = T.alloc_fragment([block_H, block_N], dtype) + acc_o = T.alloc_fragment([block_H, dim], accum_dtype) + scores_max = T.alloc_fragment([block_H], accum_dtype) + scores_max_prev = T.alloc_fragment([block_H], accum_dtype) + scores_scale = T.alloc_fragment([block_H], accum_dtype) + scores_sum = T.alloc_fragment([block_H], accum_dtype) + logsum = T.alloc_fragment([block_H], accum_dtype) + S_shared = T.alloc_shared([block_H, math.ceil(max_seqlen_kv / block_N)], dtype) + s_aux_shared = T.alloc_shared([block_H], T.float32) + + bid = bx + hid = by + cur_kv_head = hid // (kv_group_num // valid_block_H) + + cur_start_k = cu_seqlens_k[bid] + cur_end_k = cu_seqlens_k[bid + 1] + cur_seqlen_k = cur_end_k - cur_start_k + + T.copy(Q[bid, hid * valid_block_H : hid * valid_block_H + block_H, :], Q_shared) + T.fill(acc_o, 0) + T.fill(logsum, 0) + T.fill(scores_max, -T.infinity(accum_dtype)) + + # loop_range = T.ceildiv((seqlen_kv // num_split), block_N) + loop_range = T.ceildiv((cur_seqlen_k // num_split), block_N) + for k in T.Pipelined(loop_range, num_stages=num_stages): + k_start = BLOCK_TABLE[bid, (k * block_N) // page_block_size] * page_block_size + (k * block_N) % page_block_size + T.copy(K[cur_start_k + k_start : cur_start_k + k_start + block_N, cur_kv_head, :], K_shared) + T.clear(acc_s) + T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) + for i, j in T.Parallel(block_H, block_N): + acc_s[i, j] = T.if_then_else(k * block_N + j < cur_seqlen_k, acc_s[i, j], -T.infinity(accum_dtype)) + T.copy(scores_max, scores_max_prev) + T.fill(scores_max, -T.infinity(accum_dtype)) + T.reduce_max(acc_s, scores_max, dim=1, clear=False) + # scores_max_prev is m_i + # scores_max is row_max->m_ij in triton + T.copy(scores_max, S_shared[:, k]) + # scores_scale is alpha in triton + for i in T.Parallel(block_H): + scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) + for i, j in T.Parallel(block_H, block_N): + acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) + T.reduce_sum(acc_s, scores_sum, dim=1) + # scores_sum is l_ij in triton + # logsum is l_i in triton + for i in T.Parallel(block_H): + logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] + T.copy(acc_s, acc_s_cast) + for i, j in T.Parallel(block_H, dim): + acc_o[i, j] *= scores_scale[i] + v_start = BLOCK_TABLE[bid, (k * block_N) // page_block_size] * page_block_size + (k * block_N) % page_block_size + T.copy(V[cur_start_k + v_start : cur_start_k + v_start + block_N, cur_kv_head, :], V_shared) + T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) + + if has_sink: + T.copy(s_aux[hid * valid_block_H : hid * valid_block_H + block_H], s_aux_shared) + for i in T.Parallel(block_H): + logsum[i] += s_aux_shared[i] + for i, j in T.Parallel(block_H, dim): + acc_o[i, j] /= logsum[i] + for h, k in T.Parallel(block_H, math.ceil(max_seqlen_kv / block_N)): + S_shared[h, k] = T.exp2((S_shared[h, k] - scores_max[h]) * scale) / logsum[h] + for i in T.Parallel(block_H): + logsum[i] = T.log2(logsum[i]) + scores_max[i] * scale + T.copy(acc_o[:valid_block_H, :], O_shared) + T.copy(O_shared, Output[bid, hid * valid_block_H : (hid + 1) * valid_block_H, :]) + T.copy(S_shared[:valid_block_H, :], S[bid, hid * valid_block_H : (hid + 1) * 
valid_block_H, :]) + + @T.prim_func + def flashattn_gqa_decode_no_split( + Q: T.Tensor(shape_q, dtype), + K: T.Tensor(shape_k, dtype), + V: T.Tensor(shape_v, dtype), + cu_seqlens_k: T.Tensor([batch + 1], T.int32), + s_aux: T.Tensor([heads], T.float32), + BLOCK_TABLE: T.Tensor([batch, math.ceil(max_seqlen_kv / page_block_size)], T.int32), + Output: T.Tensor(shape_o, dtype), + S: T.Tensor(shape_s, dtype), + ): + flash_attn(Q, K, V, cu_seqlens_k, s_aux, BLOCK_TABLE, Output, S) + + # TODO: split version + return flashattn_gqa_decode_no_split + + +def flash_attn_with_attn_pool_decode_tilelang( + Q: torch.Tensor, ## [tq = b, q_h, q_dim] + K: torch.Tensor, ## [tk, k_h, k_dim] + V: torch.Tensor, + cu_seqlens_k: torch.Tensor, + max_seqlen_k: int, + real_max_k_seqlen: int, + num_split: int, + softmax_scale: float, + s_aux: torch.Tensor = None, + block_size: int = 64, + use_per_kv_head_sparse_index: bool = False, + tl_kernel=None, + block_table: torch.Tensor = None, +): + num_tokens, q_h, head_size = Q.shape + batch = cu_seqlens_k.size(0) - 1 + k_h = K.size(1) + + assert Q.dim() == K.dim() == 3 + assert Q.size(2) == K.size(2) + assert cu_seqlens_k.dim() == 1 + assert head_size in {64, 128, 256} + assert Q.is_contiguous() + assert K.is_contiguous() + assert V.is_contiguous() + + gqa_group_size = q_h // k_h + + O_tl = torch.zeros_like(Q) + S_tl = torch.zeros((batch, q_h, math.ceil(real_max_k_seqlen / block_size)), dtype=Q.dtype, device=Q.device) + O_tl, S_tl = tl_kernel(Q, K, V, cu_seqlens_k, s_aux, block_table) + + if use_per_kv_head_sparse_index: + S_tl = torch.max_pool2d(S_tl, kernel_size=(gqa_group_size, 1), stride=(gqa_group_size, 1)) + else: + S_tl = torch.max_pool2d(S_tl, kernel_size=(q_h, 1), stride=(q_h, 1)) + + return O_tl, S_tl + + +def test_equal_seqlen_decode_main(args): + """Test decode kernel with equal sequence lengths""" + print("Testing decode kernel with equal sequence lengths") + + batch_size = args.batch_size + q_heads = args.q_heads + kv_heads = args.kv_heads + k_seqlen = args.k_seqlen + real_max_k_seqlen = args.k_seqlen + head_size = args.head_size + block_size = args.block_size + page_block_size = args.page_block_size + dtype = torch.bfloat16 if args.dtype == T.bfloat16 else torch.float16 + + # For decode, query is just 1 token per batch + q = torch.randn(batch_size, q_heads, head_size, device="cuda", dtype=dtype) + k = torch.randn(batch_size, kv_heads, k_seqlen, head_size, device="cuda", dtype=dtype) + v = torch.randn(batch_size, kv_heads, k_seqlen, head_size, device="cuda", dtype=dtype) + softmax_scale = 1.0 / math.sqrt(head_size) + + # Generate sink values if needed + sink = None + if args.test_sink: + sink = torch.randn(q_heads, device="cuda", dtype=torch.float32) * 0.1 # Small sink values + print(f"Using sink attention with sink values: {sink}") + + # Convert to varlen format for K, V + k_varlen = k.transpose(1, 2).reshape(batch_size * k_seqlen, kv_heads, head_size).contiguous() + v_varlen = v.transpose(1, 2).reshape(batch_size * k_seqlen, kv_heads, head_size).contiguous() + + # Generate cumulative sequence lengths + cu_seqlens_k = torch.arange(0, (batch_size + 1) * k_seqlen, k_seqlen, device="cuda", dtype=torch.int32) + max_seqlen_k = k_seqlen + + print(f"q shape: {q.shape}") + print(f"k_varlen shape: {k_varlen.shape}") + print(f"v_varlen shape: {v_varlen.shape}") + + num_tokens, q_h, head_size = q.shape + batch = cu_seqlens_k.size(0) - 1 + k_h = k_varlen.size(1) + tl_kernel = flashattn(batch, q_h, k_h, args.k_seqlen, cu_seqlens_k[-1].item(), head_size, args.test_sink, 
page_block_size) + + block_table = torch.zeros(batch, math.ceil(real_max_k_seqlen / page_block_size), device="cuda", dtype=torch.int32) + block_cnt = 0 + for i in range(batch): + cur_seqlen = cu_seqlens_k[i + 1].item() - cu_seqlens_k[i].item() + for j in range(math.ceil(cur_seqlen / page_block_size)): + block_table[i, j] = block_cnt + block_cnt += 1 + block_cnt = 0 + + # Test our decode kernel + O_triton, S_triton = flash_attn_with_attn_pool_decode( + q, + k_varlen, + v_varlen, + cu_seqlens_k, + max_seqlen_k, + real_max_k_seqlen, + args.num_split, + softmax_scale, + s_aux=sink, + block_size=block_size, + ) + O_tilelang, S_tilelang = flash_attn_with_attn_pool_decode_tilelang( + q, + k_varlen, + v_varlen, + cu_seqlens_k, + max_seqlen_k, + real_max_k_seqlen, + args.num_split, + softmax_scale, + s_aux=sink, + block_size=block_size, + tl_kernel=tl_kernel, + block_table=block_table, + ) + for i in range(batch_size): + S_tilelang[i, :, math.ceil((cu_seqlens_k[i + 1].item() - cu_seqlens_k[i].item()) / block_size) :] = 0 + + # Compute torch reference + q_expanded = q.unsqueeze(2) # [b, q_heads, 1, head_size] + k_repeat = repeat_kv(k, q_heads // kv_heads) # [b, q_heads, k_seqlen, head_size] + v_repeat = repeat_kv(v, q_heads // kv_heads) # [b, q_heads, k_seqlen, head_size] + + if sink is None: + # Standard scaled dot-product attention + logits = torch.matmul(q_expanded, k_repeat.transpose(-2, -1)) * softmax_scale # [batch, q_heads, 1, seqlen_k] + attn_weights = torch.softmax(logits, dim=-1) + O_torch = torch.matmul(attn_weights, v_repeat).squeeze(2) # [batch, q_heads, head_size] + else: + # s_aux attention + logits = torch.matmul(q_expanded, k_repeat.transpose(-2, -1)) * softmax_scale # [batch, q_heads, 1, seqlen_k] + + sink_expanded = sink.view(1, q_heads, 1, 1) # [1, q_heads, 1, 1] + logits_max = torch.max(logits, dim=-1, keepdim=True).values + logits_or_sinks_max = torch.maximum(logits_max, sink_expanded) + sinks = torch.exp(sink_expanded - logits_or_sinks_max) + unnormalized_scores = torch.exp(logits - logits_or_sinks_max) + normalizer = unnormalized_scores.sum(dim=-1, keepdim=True) + sinks + attn_weights = unnormalized_scores / normalizer + O_torch = torch.matmul(attn_weights.to(v_repeat.dtype), v_repeat).squeeze(2) # [batch, q_heads, head_size] + + # Compute attention score pooling + attn_score_pooled = torch.max_pool2d( + attn_weights.squeeze(2), # [b, q_heads, k_seqlen] + kernel_size=(q_heads, block_size), + stride=(q_heads, block_size), + ceil_mode=True, + ).to(torch.float16) + + print("S_tilelang", S_tilelang) + print("attn_score_pooled", attn_score_pooled) + + max_diff_o = torch.max(torch.abs(O_triton - O_torch)) + max_diff_s = torch.max(torch.abs(S_triton - attn_score_pooled)) + max_diff_o_tilelang = torch.max(torch.abs(O_tilelang - O_torch)) + max_diff_s_tilelang = torch.max(torch.abs(S_tilelang - attn_score_pooled)) + + print(f"Max difference in O: {max_diff_o.item()}") + print(f"Max difference in S: {max_diff_s.item()}") + print(f"Max difference in O_tilelang: {max_diff_o_tilelang.item()}") + print(f"Max difference in S_tilelang: {max_diff_s_tilelang.item()}") + assert torch.allclose(O_triton, O_torch, atol=1e-2, rtol=1e-2), f"Output mismatch: {max_diff_o.item()}" + assert torch.allclose(S_triton, attn_score_pooled, atol=1e-2, rtol=1e-2), f"Score mismatch: {max_diff_s.item()}" + assert torch.allclose(O_tilelang, O_torch, atol=1e-2, rtol=1e-2), f"Output mismatch: {max_diff_o_tilelang.item()}" + assert torch.allclose(S_tilelang, attn_score_pooled, atol=1e-2, rtol=1e-2), f"Score mismatch: 
{max_diff_s_tilelang.item()}" + print("✅ All tests passed!") + + +def test_varlen_decode_main(args): + """Test decode kernel with variable sequence lengths""" + batch_size = args.batch_size + q_heads = args.q_heads + kv_heads = args.kv_heads + max_k_seqlen = args.k_seqlen # Use as max sequence length + real_max_k_seqlen = args.k_seqlen + head_size = args.head_size + block_size = args.block_size + page_block_size = args.page_block_size + dtype = torch.bfloat16 if args.dtype == T.bfloat16 else torch.float16 + + print(f"Testing decode kernel with variable sequence lengths (max_k_seqlen={max_k_seqlen})") + + # Generate sink values if needed + sink = None + if args.test_sink: + sink = torch.randn(q_heads, device="cuda", dtype=torch.float32) * 0.1 # Small sink values + print(f"Using sink attention with sink values: {sink}") + + # Generate variable length k sequences + k_seqlens = torch.randint(max_k_seqlen // 4, max_k_seqlen + 1, size=(batch_size,)) + print(f"k_seqlens: {k_seqlens}") + + # Generate cumulative sequence lengths for k + cu_seqlens_k = torch.zeros(batch_size + 1, device="cuda", dtype=torch.int32) + total_k_tokens = 0 + for i in range(batch_size): + cu_seqlens_k[i] = total_k_tokens + total_k_tokens += k_seqlens[i] + cu_seqlens_k[batch_size] = total_k_tokens + + print(f"cu_seqlens_k: {cu_seqlens_k}") + + # Generate tensors - Q is [batch_size, q_heads, head_size] for decode + q_decode = torch.randn(batch_size, q_heads, head_size, device="cuda", dtype=dtype) + k_varlen = torch.randn(total_k_tokens, kv_heads, head_size, device="cuda", dtype=dtype) + v_varlen = torch.randn(total_k_tokens, kv_heads, head_size, device="cuda", dtype=dtype) + + softmax_scale = 1.0 / math.sqrt(head_size) + max_seqlen_k = int(k_seqlens.max()) + + print(f"Actual max_seqlen_k: {max_seqlen_k}") + print(f"q_decode shape: {q_decode.shape}") + print(f"k_varlen shape: {k_varlen.shape}") + print(f"v_varlen shape: {v_varlen.shape}") + + num_tokens, q_h, head_size = q_decode.shape + batch = cu_seqlens_k.size(0) - 1 + k_h = k_varlen.size(1) + tl_kernel = flashattn(batch, q_h, k_h, args.k_seqlen, cu_seqlens_k[-1].item(), head_size, args.test_sink, page_block_size) + + block_table = torch.zeros(batch, math.ceil(real_max_k_seqlen / page_block_size), device="cuda", dtype=torch.int32) + block_cnt = 0 + for i in range(batch): + cur_seqlen = cu_seqlens_k[i + 1].item() - cu_seqlens_k[i].item() + for j in range(math.ceil(cur_seqlen / page_block_size)): + block_table[i, j] = block_cnt + block_cnt += 1 + block_cnt = 0 + + # Test our decode kernel + O_triton, S_triton = flash_attn_with_attn_pool_decode( + q_decode, + k_varlen, + v_varlen, + cu_seqlens_k, + max_seqlen_k, + real_max_k_seqlen, + args.num_split, + softmax_scale, + s_aux=sink, + block_size=block_size, + ) + O_tilelang, S_tilelang = flash_attn_with_attn_pool_decode_tilelang( + q_decode, + k_varlen, + v_varlen, + cu_seqlens_k, + max_seqlen_k, + real_max_k_seqlen, + args.num_split, + softmax_scale, + s_aux=sink, + block_size=block_size, + tl_kernel=tl_kernel, + block_table=block_table, + ) + for i in range(batch_size): + S_tilelang[i, :, math.ceil((cu_seqlens_k[i + 1].item() - cu_seqlens_k[i].item()) / block_size) :] = 0 + + # Create torch reference - pad tensors for comparison + k_padded_list = [] + v_padded_list = [] + + for i in range(batch_size): + actual_k_len = k_seqlens[i] + + # Extract and pad k, v for this batch + k_start = cu_seqlens_k[i] + k_end = cu_seqlens_k[i + 1] + + # Pad to max_seqlen_k + k_padded = torch.zeros(max_seqlen_k, kv_heads, head_size, 
device="cuda", dtype=dtype) + v_padded = torch.zeros(max_seqlen_k, kv_heads, head_size, device="cuda", dtype=dtype) + + k_padded[:actual_k_len] = k_varlen[k_start:k_end] + v_padded[:actual_k_len] = v_varlen[k_start:k_end] + + k_padded_list.append(k_padded) + v_padded_list.append(v_padded) + + # Stack to create batched tensors [b, max_seqlen, kv_heads, head_size] + k_padded_batched = torch.stack(k_padded_list, dim=0).transpose(1, 2) # [b, kv_heads, max_seqlen, head_size] + v_padded_batched = torch.stack(v_padded_list, dim=0).transpose(1, 2) # [b, kv_heads, max_seqlen, head_size] + + # Expand q to match kv heads: [b, q_heads, 1, head_size] + q_expanded = q_decode.unsqueeze(2) # [b, q_heads, 1, head_size] + + print(f"q_expanded shape: {q_expanded.shape}") + print(f"k_padded_batched shape: {k_padded_batched.shape}") + print(f"v_padded_batched shape: {v_padded_batched.shape}") + + # Compute torch reference + k_repeat = repeat_kv(k_padded_batched, q_heads // kv_heads) # [b, q_heads, max_seqlen, head_size] + v_repeat = repeat_kv(v_padded_batched, q_heads // kv_heads) # [b, q_heads, max_seqlen, head_size] + + if sink is None: + # Standard attention computation: [b, q_heads, 1, head_size] @ [b, q_heads, head_size, max_seqlen] + attn_score = torch.matmul(q_expanded, k_repeat.transpose(-2, -1)) * softmax_scale # [b, q_heads, 1, max_seqlen] + + # Apply sequence length masking + for i in range(batch_size): + actual_k_len = k_seqlens[i] + attn_score[i, :, :, actual_k_len:] = float("-inf") + + attn_weights = attn_score.softmax(dim=-1) # [b, q_heads, 1, max_seqlen] + + # Mask out invalid positions + for i in range(batch_size): + actual_k_len = k_seqlens[i] + attn_weights[i, :, :, actual_k_len:] = 0.0 + + # Compute output: [b, q_heads, 1, max_seqlen] @ [b, q_heads, max_seqlen, head_size] + O_torch = torch.matmul(attn_weights, v_repeat) # [b, q_heads, 1, head_size] + else: + # s_aux attention + logits = torch.matmul(q_expanded, k_repeat.transpose(-2, -1)) * softmax_scale # [b, q_heads, 1, max_seqlen] + + # Apply sequence length masking + for i in range(batch_size): + actual_k_len = k_seqlens[i] + logits[i, :, :, actual_k_len:] = float("-inf") + + sink_expanded = sink.view(1, q_heads, 1, 1) # [1, q_heads, 1, 1] + logits_max = torch.max(logits, dim=-1, keepdim=True).values + logits_or_sinks_max = torch.maximum(logits_max, sink_expanded) + sinks = torch.exp(sink_expanded - logits_or_sinks_max) + unnormalized_scores = torch.exp(logits - logits_or_sinks_max) + normalizer = unnormalized_scores.sum(dim=-1, keepdim=True) + sinks + attn_weights = unnormalized_scores / normalizer + + # Mask out invalid positions + for i in range(batch_size): + actual_k_len = k_seqlens[i] + attn_weights[i, :, :, actual_k_len:] = 0.0 + + # Compute output: [b, q_heads, 1, max_seqlen] @ [b, q_heads, max_seqlen, head_size] + O_torch = torch.matmul(attn_weights.to(v_repeat.dtype), v_repeat) # [b, q_heads, 1, head_size] + + O_torch = O_torch.squeeze(2) # [b, q_heads, head_size] + + # Compute attention score pooling for S + attn_score_pooled = torch.max_pool2d( + attn_weights.squeeze(2), # [b, q_heads, max_seqlen] + kernel_size=(q_heads, block_size), + stride=(q_heads, block_size), + ceil_mode=True, + ).to(dtype=torch.float16) # [b, 1, ceil(max_seqlen/block_size)] + + print(f"O_triton shape: {O_triton.shape}") + print(f"O_tilelang shape: {O_tilelang.shape}") + print(f"O_torch shape: {O_torch.shape}") + print(f"S_triton shape: {S_triton.shape}") + print(f"S_tilelang shape: {S_tilelang.shape}") + print(f"attn_score_pooled shape: 
{attn_score_pooled.shape}") + + # Compare results + max_diff_o = torch.max(torch.abs(O_triton - O_torch)) + max_diff_o_tl = torch.max(torch.abs(O_tilelang - O_torch)) + print(f"Max difference in O: {max_diff_o.item()}") + print(f"Max difference in O_tilelang: {max_diff_o_tl.item()}") + + max_diff_s = torch.max(torch.abs(S_triton - attn_score_pooled)) + max_diff_s_tl = torch.max(torch.abs(S_tilelang[:, :, : math.ceil(max_seqlen_k / block_size)] - attn_score_pooled)) + print(f"Max difference in S: {max_diff_s.item()}") + print(f"Max difference in S_tilelang: {max_diff_s_tl.item()}") + + assert torch.allclose(O_triton, O_torch, atol=1e-2, rtol=1e-2), f"Output mismatch: {max_diff_o.item()}" + assert torch.allclose(S_triton, attn_score_pooled, atol=1e-2, rtol=1e-2), f"Score mismatch: {max_diff_s.item()}" + assert torch.allclose(O_tilelang, O_torch, atol=1e-2, rtol=1e-2), f"Output mismatch: {max_diff_o_tl.item()}" + assert torch.allclose(S_tilelang[:, :, : math.ceil(max_seqlen_k / block_size)], attn_score_pooled, atol=1e-2, rtol=1e-2), ( + f"Score mismatch: {max_diff_s_tl.item()}" + ) + + print("✅ All tests passed!") + + +def speed_benchmark_decode_comparison(args): + """Speed benchmark for decode kernel""" + batch_size = args.batch_size + q_heads = args.q_heads + kv_heads = args.kv_heads + max_k_seqlen = args.k_seqlen + real_max_k_seqlen = args.k_seqlen + head_size = args.head_size + block_size = args.block_size + page_block_size = args.page_block_size + dtype = torch.bfloat16 if args.dtype == T.bfloat16 else torch.float16 + + print("\n=== Decode Speed Benchmark Comparison ===") + print("Configuration:") + print(f" Batch size: {batch_size}") + print(f" Q heads: {q_heads}, KV heads: {kv_heads}") + print(f" Max K sequence length: {max_k_seqlen}") + print(f" Head size: {head_size}") + print(f" Block size: {block_size}") + print(f" Data type: {dtype}") + print(f" Variable lengths: {args.test_varlen}") + print(f" s_aux attention: {args.test_sink}") + print() + + # Generate input data + if args.test_varlen: + k_seqlens = torch.randint(max_k_seqlen // 4, max_k_seqlen + 1, size=(batch_size,)) + else: + k_seqlens = torch.full((batch_size,), max_k_seqlen, dtype=int) + + # Generate cumulative sequence lengths for k + cu_seqlens_k = torch.zeros(batch_size + 1, device="cuda", dtype=torch.int32) + total_k_tokens = 0 + for i in range(batch_size): + cu_seqlens_k[i] = total_k_tokens + total_k_tokens += k_seqlens[i] + cu_seqlens_k[batch_size] = total_k_tokens + + # Generate tensors + q_decode = torch.randn(batch_size, q_heads, head_size, device="cuda", dtype=dtype) + k_varlen = torch.randn(total_k_tokens, kv_heads, head_size, device="cuda", dtype=dtype) + v_varlen = torch.randn(total_k_tokens, kv_heads, head_size, device="cuda", dtype=dtype) + + softmax_scale = 1.0 / math.sqrt(head_size) + max_seqlen_k = int(k_seqlens.max()) + + # Generate sink values if needed + sink = None + if args.test_sink: + sink = torch.randn(q_heads, device="cuda", dtype=torch.float32) * 0.1 # Small sink values + print(" Using sink attention with sink values") + + print("Setup complete:") + print(f" Total K tokens: {total_k_tokens}") + print(f" Actual max K seq len: {max_seqlen_k}") + if args.test_varlen: + print(f" K sequence lengths: {k_seqlens.tolist()}") + + # Warmup + num_tokens, q_h, head_size = q_decode.shape + batch = cu_seqlens_k.size(0) - 1 + k_h = k_varlen.size(1) + tl_kernel = flashattn(batch, q_h, k_h, args.k_seqlen, cu_seqlens_k[-1].item(), head_size, args.test_sink, page_block_size) + + block_table = torch.zeros(batch, 
math.ceil(real_max_k_seqlen / page_block_size), device="cuda", dtype=torch.int32) + block_cnt = 0 + for i in range(batch): + cur_seqlen = cu_seqlens_k[i + 1].item() - cu_seqlens_k[i].item() + for j in range(math.ceil(cur_seqlen / page_block_size)): + block_table[i, j] = block_cnt + block_cnt += 1 + block_cnt = 0 + + # Benchmark + print("⚡ Benchmarking Tilelang kernel (100 iterations)...") + tilelang_time = do_bench( + flash_attn_with_attn_pool_decode_tilelang, + q_decode, + k_varlen, + v_varlen, + cu_seqlens_k, + max_seqlen_k, + args.k_seqlen, + 1, + softmax_scale, + sink, + block_size, + False, + tl_kernel, + block_table, + ) + print(f"Average decode kernel time Tilelang: {tilelang_time:.3f} ms") + + # Benchmark + print("⚡ Benchmarking Triton kernel (100 iterations)...") + triton_time = do_bench( + flash_attn_with_attn_pool_decode, + q_decode, + k_varlen, + v_varlen, + cu_seqlens_k, + max_seqlen_k, + args.k_seqlen, + 1, + softmax_scale, + sink, + block_size, + ) + print(f"Average decode kernel time Triton: {triton_time:.3f} ms") + print(f"Speedup: {(triton_time / tilelang_time):.3f}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Flash Attention Decode with Attention Pooling") + parser.add_argument("--batch_size", type=int, default=1, help="Batch size") + parser.add_argument("--q_heads", type=int, default=32, help="Number of query heads") + parser.add_argument("--kv_heads", type=int, default=8, help="Number of key-value heads") + parser.add_argument("--k_seqlen", type=int, default=8192, help="Key sequence length") + parser.add_argument("--head_size", type=int, default=128, choices=[64, 128, 256], help="Head dimension") + parser.add_argument("--block_size", type=int, default=128, help="Block size for computation") + parser.add_argument("--dtype", type=str, default=T.bfloat16, choices=[T.float16, T.bfloat16], help="Data type") + parser.add_argument("--test_varlen", action="store_true", help="Test with truly variable sequence lengths") + parser.add_argument("--test_sink", action="store_true", help="Test with sink attention mechanism") + parser.add_argument("--benchmark", action="store_true", help="Run speed benchmark") + parser.add_argument("--num_split", type=int, default=1, choices=[1, 16], help="Number of splits") + parser.add_argument("--page_block_size", type=int, default=128, help="Page block size") + args = parser.parse_args() + args.test_sink = True + args.test_varlen = True + args.dtype = T.float16 + args.num_split = 1 + + if args.benchmark: + speed_benchmark_decode_comparison(args) + elif args.test_varlen: + test_varlen_decode_main(args) + else: + test_equal_seqlen_decode_main(args) diff --git a/examples/flash_decoding/example_mha_inference.py b/examples/flash_decoding/example_mha_inference.py index 3eabc9a76cccf4075ca817dc7e39be386095d553..5b243d695eefd8bb2d0fa0f18e30986d96ca7135 100644 --- a/examples/flash_decoding/example_mha_inference.py +++ b/examples/flash_decoding/example_mha_inference.py @@ -10,12 +10,12 @@ num_split = 4 @tilelang.jit(out_idx=[5]) def flashattn(batch, heads, seqlen_q, seqlen_kv, dim, is_causal, block_M, block_N): - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) shape_q = [batch, seqlen_q, heads, dim] shape_kv = [batch, seqlen_kv, heads, dim] part_shape = [batch, seqlen_q, heads, num_split, dim] - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 @T.macro def MMA0( @@ -29,14 +29,11 @@ def flashattn(batch, heads, seqlen_q, seqlen_kv, 
dim, is_causal, block_M, block_ bid: T.int32, sid: T.int32, ): - T.copy( - K[bid, (seqlen_kv // num_split) * sid + k * block_N:(seqlen_kv // num_split) * sid + - (k + 1) * block_N, hid, :], K_shared) + T.copy(K[bid, (seqlen_kv // num_split) * sid + k * block_N : (seqlen_kv // num_split) * sid + (k + 1) * block_N, hid, :], K_shared) # TODO: Handle causal split case if is_causal: for i, j in T.Parallel(block_M, block_N): - acc_s[i, j] = T.if_then_else(mid * block_M + i >= k * block_N + j, 0, - -T.infinity(acc_s.dtype)) + acc_s[i, j] = T.if_then_else(mid * block_M + i >= k * block_N + j, 0, -T.infinity(acc_s.dtype)) else: T.clear(acc_s) T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) @@ -52,24 +49,24 @@ def flashattn(batch, heads, seqlen_q, seqlen_kv, dim, is_causal, block_M, block_ bid: T.int32, sid: T.int32, ): - T.copy( - V[bid, (seqlen_kv // num_split) * sid + k * block_N:(seqlen_kv // num_split) * sid + - (k + 1) * block_N, hid, :], V_shared) + T.copy(V[bid, (seqlen_kv // num_split) * sid + k * block_N : (seqlen_kv // num_split) * sid + (k + 1) * block_N, hid, :], V_shared) T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) @T.macro def Softmax( - acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype), - acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype), - scores_max: T.FragmentBuffer([block_M], accum_dtype), - scores_max_prev: T.FragmentBuffer([block_M], accum_dtype), - scores_scale: T.FragmentBuffer([block_M], accum_dtype), - scores_sum: T.FragmentBuffer([block_M], accum_dtype), - logsum: T.FragmentBuffer([block_M], accum_dtype), + acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype), + acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype), + scores_max: T.FragmentBuffer([block_M], accum_dtype), + scores_max_prev: T.FragmentBuffer([block_M], accum_dtype), + scores_scale: T.FragmentBuffer([block_M], accum_dtype), + scores_sum: T.FragmentBuffer([block_M], accum_dtype), + logsum: T.FragmentBuffer([block_M], accum_dtype), ): T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) # To do causal softmax, we need to set the scores_max to 0 if it is -inf # This process is called Check_inf in FlashAttention3 code, and it only need to be done # in the first ceil_div(kBlockM, kBlockN) steps. 
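# A minimal sketch of the recurrence the Softmax/Rescale macros above implement: an online
# softmax that tracks a running row max and rescales the partial accumulator whenever that
# max grows. The kernel works in base 2 with scale = log2(e) / sqrt(dim), so
# exp(x / sqrt(dim)) == exp2(x * scale). Helper and variable names here
# (online_softmax_step, m, l, acc_o) are hypothetical and only illustrate the math; they are
# not part of the kernels in this patch.
import torch

def online_softmax_step(acc_s, m, l, acc_o, v_tile, scale):
    # acc_s: raw Q @ K^T tile [block_M, block_N]; v_tile: V tile [block_N, dim]
    # m: running row max, l: running row sum, acc_o: running unnormalized output.
    # Initial per-row state: m = -inf, l = 0, acc_o = 0.
    m_prev = m
    m = torch.maximum(m_prev, acc_s.max(dim=-1).values)   # cf. T.reduce_max + the added T.max
    alpha = torch.exp2((m_prev - m) * scale)               # scores_scale: rescales the old state
    p = torch.exp2((acc_s - m[:, None]) * scale)           # tile probabilities w.r.t. the new max
    l = l * alpha + p.sum(dim=-1)                          # logsum update
    acc_o = acc_o * alpha[:, None] + p @ v_tile.to(p.dtype)  # Rescale followed by MMA1
    return m, l, acc_o

# After the last tile the normalized output is acc_o / l[:, None], and the per-row
# log-sum-exp written to glse is log2(l) + m * scale, which combine() uses to merge the
# num_split partial results. If a first tile is fully masked, m stays -inf and m_prev - m
# becomes NaN; that is the situation the Check_inf comment above refers to.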
@@ -89,23 +86,21 @@ def flashattn(batch, heads, seqlen_q, seqlen_kv, dim, is_causal, block_M, block_ @T.macro def Rescale( - acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), - scores_scale: T.FragmentBuffer([block_M], accum_dtype), + acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), + scores_scale: T.FragmentBuffer([block_M], accum_dtype), ): for i, j in T.Parallel(block_M, dim): acc_o[i, j] *= scores_scale[i] @T.macro def flash_attn_split( - Q: T.Tensor(shape_q, dtype), - K: T.Tensor(shape_kv, dtype), - V: T.Tensor(shape_kv, dtype), - glse: T.Tensor([batch, heads, num_split, seqlen_q], dtype), - Output_partial: T.Tensor(part_shape, dtype), + Q: T.Tensor(shape_q, dtype), + K: T.Tensor(shape_kv, dtype), + V: T.Tensor(shape_kv, dtype), + glse: T.Tensor([batch, heads, num_split, seqlen_q], dtype), + Output_partial: T.Tensor(part_shape, dtype), ): - with T.Kernel( - T.ceildiv(seqlen_q, block_M), heads * batch, num_split, - threads=128) as (bx, by, bz): + with T.Kernel(T.ceildiv(seqlen_q, block_M), heads * batch, num_split, threads=128) as (bx, by, bz): Q_shared = T.alloc_shared([block_M, dim], dtype) K_shared = T.alloc_shared([block_N, dim], dtype) V_shared = T.alloc_shared([block_N, dim], dtype) @@ -126,39 +121,36 @@ def flashattn(batch, heads, seqlen_q, seqlen_kv, dim, is_causal, block_M, block_ # NOTE(wt): tma barrier has some problems with padded dimensions (seq_q here) currently # disable relevant tma copy and use SIMT as fallback for now - T.copy(Q[bid, mid * block_M:(mid + 1) * block_M, hid, :], Q_shared, disable_tma=True) + T.copy(Q[bid, mid * block_M : (mid + 1) * block_M, hid, :], Q_shared, disable_tma=True) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) # TODO: Handle causal split case loop_range = ( - T.min(T.ceildiv(seqlen_kv, block_N), T.ceildiv( - (mid + 1) * block_M, block_N)) if is_causal else T.ceildiv( - (seqlen_kv // num_split), block_N)) + T.min(T.ceildiv(seqlen_kv, block_N), T.ceildiv((mid + 1) * block_M, block_N)) + if is_causal + else T.ceildiv((seqlen_kv // num_split), block_N) + ) for k in T.Pipelined(loop_range, num_stages=2): MMA0(K, Q_shared, K_shared, acc_s, k, mid, hid, bid, sid) - Softmax(acc_s, acc_s_cast, scores_max, scores_max_prev, scores_scale, scores_sum, - logsum) + Softmax(acc_s, acc_s_cast, scores_max, scores_max_prev, scores_scale, scores_sum, logsum) Rescale(acc_o, scores_scale) MMA1(V, V_shared, acc_s_cast, acc_o, k, hid, bid, sid) for i, j in T.Parallel(block_M, dim): acc_o[i, j] /= logsum[i] for i in T.Parallel(block_M): logsum[i] = T.log2(logsum[i]) + scores_max[i] * scale - T.copy(logsum, glse[bid, hid, sid, mid * block_M:(mid + 1) * block_M]) + T.copy(logsum, glse[bid, hid, sid, mid * block_M : (mid + 1) * block_M]) T.copy(acc_o, O_shared) - T.copy( - O_shared, - Output_partial[bid, mid * block_M:(mid + 1) * block_M, hid, sid, :], - disable_tma=True) + T.copy(O_shared, Output_partial[bid, mid * block_M : (mid + 1) * block_M, hid, sid, :], disable_tma=True) @T.macro def combine( - glse: T.Tensor([batch, heads, num_split, seqlen_q], dtype), - Output_partial: T.Tensor(part_shape, dtype), - Output: T.Tensor(shape_q, dtype), + glse: T.Tensor([batch, heads, num_split, seqlen_q], dtype), + Output_partial: T.Tensor(part_shape, dtype), + Output: T.Tensor(shape_q, dtype), ): with T.Kernel(T.ceildiv(seqlen_q, block_M), heads, batch, threads=128) as (bx, by, bz): po_local = T.alloc_fragment([block_M, dim], dtype) @@ -171,20 +163,25 @@ def flashattn(batch, heads, seqlen_q, seqlen_kv, dim, is_causal, block_M, block_ 
lse_max_local = T.alloc_fragment([block_M], accum_dtype) scale_local = T.alloc_fragment([block_M], accum_dtype) - T.annotate_layout({ - o_accum_local: T.Fragment(o_accum_local.shape, forward_thread_fn=lambda i, j: i), - o_shared: tilelang.layout.make_swizzled_layout(o_shared), - po_shared: tilelang.layout.make_swizzled_layout(po_shared), - }) + T.annotate_layout( + { + o_accum_local: T.Fragment(o_accum_local.shape, forward_thread_fn=lambda i, j: i), + o_shared: tilelang.layout.make_swizzled_layout(o_shared), + po_shared: tilelang.layout.make_swizzled_layout(po_shared), + } + ) T.clear(lse_logsum_local) T.clear(o_accum_local) - T.copy(glse[ - bz, - by, - :, - bx * block_M:(bx + 1) * block_M, - ], lse_local) + T.copy( + glse[ + bz, + by, + :, + bx * block_M : (bx + 1) * block_M, + ], + lse_local, + ) T.reduce_max(lse_local, lse_max_local, dim=0, clear=False) for k in T.Pipelined(num_split): T.copy(lse_local[k, :], lse_local_split) @@ -193,10 +190,7 @@ def flashattn(batch, heads, seqlen_q, seqlen_kv, dim, is_causal, block_M, block_ for i in T.Parallel(block_M): lse_logsum_local[i] = T.log2(lse_logsum_local[i]) + lse_max_local[i] for k in T.Pipelined(num_split, num_stages=2): - T.copy( - Output_partial[bz, bx * block_M:(bx + 1) * block_M, by, k, :], - po_shared, - disable_tma=True) + T.copy(Output_partial[bz, bx * block_M : (bx + 1) * block_M, by, k, :], po_shared, disable_tma=True) T.copy(po_shared, po_local) for i in T.Parallel(block_M): lse_local_split[i] = lse_local[k, i] @@ -205,16 +199,16 @@ def flashattn(batch, heads, seqlen_q, seqlen_kv, dim, is_causal, block_M, block_ for i, j in T.Parallel(block_M, dim): o_accum_local[i, j] += po_local[i, j] * scale_local[i] T.copy(o_accum_local, o_shared) - T.copy(o_shared, Output[bz, bx * block_M:(bx + 1) * block_M, by, :], disable_tma=True) + T.copy(o_shared, Output[bz, bx * block_M : (bx + 1) * block_M, by, :], disable_tma=True) @T.prim_func def flashattn_mha_inference( - Q: T.Tensor(shape_q, dtype), - K: T.Tensor(shape_kv, dtype), - V: T.Tensor(shape_kv, dtype), - glse: T.Tensor([batch, heads, num_split, seqlen_q], dtype), - Output_partial: T.Tensor(part_shape, dtype), # [batch, seqlen_q, heads, num_split, dim] - Output: T.Tensor(shape_q, dtype), + Q: T.Tensor(shape_q, dtype), + K: T.Tensor(shape_kv, dtype), + V: T.Tensor(shape_kv, dtype), + glse: T.Tensor([batch, heads, num_split, seqlen_q], dtype), + Output_partial: T.Tensor(part_shape, dtype), # [batch, seqlen_q, heads, num_split, dim] + Output: T.Tensor(shape_q, dtype), ): flash_attn_split(Q, K, V, glse, Output_partial) combine(glse, Output_partial, Output) @@ -225,10 +219,10 @@ def flashattn(batch, heads, seqlen_q, seqlen_kv, dim, is_causal, block_M, block_ def ref_program(Q, K, V, glse, Output_partial, causal): assert causal is False dim = Q.size(-1) - scores = torch.einsum('bqhd,bkhd->bhqk', Q, K) + scores = torch.einsum("bqhd,bkhd->bhqk", Q, K) scores = scores / torch.sqrt(torch.tensor(dim, dtype=scores.dtype)) attention_weights = F.softmax(scores, dim=-1) - output = torch.einsum('bhqk,bkhd->bqhd', attention_weights, V) + output = torch.einsum("bhqk,bkhd->bqhd", attention_weights, V) return output @@ -256,7 +250,7 @@ def flash_split_ref(Q, K, V, causal): block_N = 128 seqlen_kv = K.size(1) - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) acc_s = torch.empty((batch, nheads, block_M, block_N), device="cuda", dtype=torch.float) acc_s_cast = torch.empty((batch, nheads, block_M, block_N), device="cuda", dtype=torch.float16) acc_o = 
torch.empty((batch, block_M, nheads, dim), device="cuda", dtype=torch.float) @@ -273,14 +267,15 @@ def flash_split_ref(Q, K, V, causal): for ks in range(num_split): acc_o.fill_(0) logsum.fill_(0) - scores_max.fill_(float('-inf')) - scores_max_prev.fill_(float('-inf')) + scores_max.fill_(float("-inf")) + scores_max_prev.fill_(float("-inf")) for i in range(int((seqlen_kv // num_split) / block_N)): acc_s.fill_(0) - acc_s = torch.einsum('bqhd,bkhd->bhqk', Q_, - K[:, (seqlen_kv // num_split) * ks + - i * block_N:(seqlen_kv // num_split) * ks + - (i + 1) * block_N, :, :]) # [batch, seqlen, nheads, block_N] + acc_s = torch.einsum( + "bqhd,bkhd->bhqk", + Q_, + K[:, (seqlen_kv // num_split) * ks + i * block_N : (seqlen_kv // num_split) * ks + (i + 1) * block_N, :, :], + ) # [batch, seqlen, nheads, block_N] scores_max_prev = scores_max scores_max = acc_s.max(dim=-1, keepdim=False).values # [blockM] scores_scale = torch.exp2(scores_max_prev - scores_max) @@ -288,9 +283,10 @@ def flash_split_ref(Q, K, V, causal): acc_s = torch.exp2(acc_s - scores_max[:, :, :, None]) acc_s_cast = acc_s.to(torch.float16) acc_o += torch.einsum( - 'bhqk,bkhd->bqhd', acc_s_cast, - V[:, (seqlen_kv // num_split) * ks + i * block_N:(seqlen_kv // num_split) * ks + - (i + 1) * block_N, :, :]) + "bhqk,bkhd->bqhd", + acc_s_cast, + V[:, (seqlen_kv // num_split) * ks + i * block_N : (seqlen_kv // num_split) * ks + (i + 1) * block_N, :, :], + ) scores_sum = acc_s.sum(dim=-1, keepdim=False) logsum = logsum * scores_scale + scores_sum acc_o /= logsum[:, :, :, None].transpose(1, 2) @@ -298,8 +294,7 @@ def flash_split_ref(Q, K, V, causal): gacc_o[ks, :, :, :, :] = acc_o glogsum[ks, :, :, :] = logsum - return glogsum.to(torch.float16).permute(1, 2, 0, - 3), gacc_o.to(torch.float16).permute(1, 2, 3, 0, 4) + return glogsum.to(torch.float16).permute(1, 2, 0, 3), gacc_o.to(torch.float16).permute(1, 2, 3, 0, 4) def main(BATCH=1, H=32, Q_CTX=128, KV_CTX=8192, D_HEAD=128, causal=False): diff --git a/examples/fusedmoe/example_fusedmoe_tilelang.py b/examples/fusedmoe/example_fusedmoe_tilelang.py index a8d6849659118cb630ab0d97dad7b0233abec0b7..36c6ef3dc20004e9ac0076d2f6bc7680e5c33371 100644 --- a/examples/fusedmoe/example_fusedmoe_tilelang.py +++ b/examples/fusedmoe/example_fusedmoe_tilelang.py @@ -9,17 +9,18 @@ from example_fusedmoe_torch import * @tilelang.jit(pass_configs={"tl.disable_tma_lower": True, "tl.disable_warp_specialized": True}) -def moe_forward_tilelang_shared(d_hidden, - d_expert, - n_shared_experts, - dtype, - num_tokens, - block_token=128, - block_dhidden=128, - block_dexpert=128, - threads=256, - num_stages=1): - +def moe_forward_tilelang_shared( + d_hidden, + d_expert, + n_shared_experts, + dtype, + num_tokens, + block_token=128, + block_dhidden=128, + block_dexpert=128, + threads=256, + num_stages=1, +): scale = 1.44269504 # log2(e) # Parameters @@ -32,21 +33,19 @@ def moe_forward_tilelang_shared(d_hidden, shared_W_up_shape = (dexpert, dhidden) shared_W_down_shape = (dhidden, dexpert) - accum_type = "float32" + accum_type = T.float32 @T.prim_func def kernel_shared( - input: T.Tensor(input_shape, dtype), # type: ignore - shared_W_gate: T.Tensor(shared_W_gate_shape, dtype), # type: ignore - shared_W_up: T.Tensor(shared_W_up_shape, dtype), # type: ignore - shared_W_down: T.Tensor(shared_W_down_shape, dtype), # type: ignore - up_logits: T.Tensor((num_tokens, dexpert), dtype), # type: ignore - output: T.Tensor(input_shape, dtype), # type: ignore + input: T.Tensor(input_shape, dtype), # type: ignore + shared_W_gate: 
T.Tensor(shared_W_gate_shape, dtype), # type: ignore + shared_W_up: T.Tensor(shared_W_up_shape, dtype), # type: ignore + shared_W_down: T.Tensor(shared_W_down_shape, dtype), # type: ignore + up_logits: T.Tensor((num_tokens, dexpert), dtype), # type: ignore + output: T.Tensor(input_shape, dtype), # type: ignore ): # Step 1: Compute gate and up logits - with T.Kernel( - T.ceildiv(num_tokens, block_token), T.ceildiv(dexpert, block_dexpert), - threads=threads) as (bx, by): + with T.Kernel(T.ceildiv(num_tokens, block_token), T.ceildiv(dexpert, block_dexpert), threads=threads) as (bx, by): # Split the block to shared experts and routed experts input_shared = T.alloc_fragment((block_token, block_dhidden), dtype=dtype) W_gate_shared = T.alloc_shared((block_dexpert, block_dhidden), dtype=dtype) @@ -70,16 +69,13 @@ def moe_forward_tilelang_shared(d_hidden, # Fuse with SiLU and element-wise product for i, j in T.Parallel(block_token, block_dexpert): - gate_logits_local[i, j] = gate_logits_local[i, j] * ( - 1.0 / (1.0 + T.exp2(-gate_logits_local[i, j] * scale))) + gate_logits_local[i, j] = gate_logits_local[i, j] * (1.0 / (1.0 + T.exp2(-gate_logits_local[i, j] * scale))) up_logits_local[i, j] = up_logits_local[i, j] * gate_logits_local[i, j] T.copy(up_logits_local, up_logits[bx * block_token, by * block_dexpert]) # Step 2: Compute down logits - with T.Kernel( - T.ceildiv(num_tokens, block_token), T.ceildiv(dhidden, block_dhidden), - threads=threads) as (bx, by): + with T.Kernel(T.ceildiv(num_tokens, block_token), T.ceildiv(dhidden, block_dhidden), threads=threads) as (bx, by): up_logits_shared = T.alloc_fragment((block_token, block_dexpert), dtype=dtype) W_down_shared = T.alloc_shared((block_dhidden, block_dexpert), dtype=dtype) output_local = T.alloc_fragment((block_token, block_dhidden), dtype=accum_type) @@ -98,20 +94,21 @@ def moe_forward_tilelang_shared(d_hidden, @tilelang.jit(pass_configs={"tl.disable_tma_lower": True, "tl.disable_warp_specialized": True}) -def moe_forward_tilelang_routed(d_hidden, - d_expert, - n_routed_experts, - dtype, - group_sum, - group_count, - block_token=128, - block_dhidden=128, - block_dexpert=128, - threads=256, - num_stages=1, - k_pack=1, - coalesced_width=None): - +def moe_forward_tilelang_routed( + d_hidden, + d_expert, + n_routed_experts, + dtype, + group_sum, + group_count, + block_token=128, + block_dhidden=128, + block_dexpert=128, + threads=256, + num_stages=1, + k_pack=1, + coalesced_width=None, +): scale = 1.44269504 # log2(e) # Parameters @@ -124,7 +121,7 @@ def moe_forward_tilelang_routed(d_hidden, # group_count = len(group_sizes_list) # M = sum([(group_size + block_token - 1) // block_token for group_size in group_sizes_list]) M = math.ceil(group_sum / block_token) + group_count - accum_dtype = "float32" + accum_dtype = T.float32 # Tensors: Note that input shape is reshape to (bs * seq_len * n_experts_per_token, dhidden) for grouped gemm input_shape = (group_sum, dhidden) @@ -132,22 +129,22 @@ def moe_forward_tilelang_routed(d_hidden, routed_expert_gate_shape = (n_routed_experts, dexpert, dhidden) routed_expert_up_shape = (n_routed_experts, dexpert, dhidden) routed_expert_down_shape = (n_routed_experts, dhidden, dexpert) - routed_expert_weights_shape = (group_sum) - group_sizes_shape = (n_routed_experts) + routed_expert_weights_shape = group_sum + group_sizes_shape = n_routed_experts @T.prim_func def kernel( - input: T.Tensor(input_shape, dtype), # type: ignore - routed_expert_gate: T.Tensor(routed_expert_gate_shape, dtype), # type: ignore - 
routed_expert_up: T.Tensor(routed_expert_up_shape, dtype), # type: ignore - routed_expert_down: T.Tensor(routed_expert_down_shape, dtype), # type: ignore - routed_expert_weights: T.Tensor(routed_expert_weights_shape, dtype), # type: ignore - group_sizes: T.Tensor(group_sizes_shape, "int32"), # type: ignore - group_offsets: T.Tensor(group_sizes_shape, "int32"), # type: ignore - group_padded_offsets: T.Tensor(group_sizes_shape, "int32"), # type: ignore - group_idx_for_bx: T.Tensor((M,), "int32"), # type: ignore - up_logits: T.Tensor(intermediate_shape, dtype), # type: ignore - output: T.Tensor(input_shape, dtype), # type: ignore + input: T.Tensor(input_shape, dtype), # type: ignore + routed_expert_gate: T.Tensor(routed_expert_gate_shape, dtype), # type: ignore + routed_expert_up: T.Tensor(routed_expert_up_shape, dtype), # type: ignore + routed_expert_down: T.Tensor(routed_expert_down_shape, dtype), # type: ignore + routed_expert_weights: T.Tensor(routed_expert_weights_shape, dtype), # type: ignore + group_sizes: T.Tensor(group_sizes_shape, T.int32), # type: ignore + group_offsets: T.Tensor(group_sizes_shape, T.int32), # type: ignore + group_padded_offsets: T.Tensor(group_sizes_shape, T.int32), # type: ignore + group_idx_for_bx: T.Tensor((M,), T.int32), # type: ignore + up_logits: T.Tensor(intermediate_shape, dtype), # type: ignore + output: T.Tensor(input_shape, dtype), # type: ignore ): # Step 1: Compute gate and up logits with T.Kernel(M, T.ceildiv(dexpert, block_dexpert), threads=threads) as (bx, by): @@ -158,8 +155,8 @@ def moe_forward_tilelang_routed(d_hidden, gate_logits_local = T.alloc_fragment((block_token, block_dexpert), dtype=accum_dtype) up_logits_local = T.alloc_fragment((block_token, block_dexpert), dtype=accum_dtype) - cur_group_idx = T.alloc_local([1], "int32") - cur_group_size = T.alloc_local([1], "int32") + cur_group_idx = T.alloc_local([1], T.int32) + cur_group_size = T.alloc_local([1], T.int32) T.use_swizzle(10, enable=True) @@ -168,48 +165,37 @@ def moe_forward_tilelang_routed(d_hidden, cur_group_idx[0] = group_idx_for_bx[bx] cur_group_size[0] = group_sizes[cur_group_idx[0]] - m_start = m_start_padded - group_padded_offsets[cur_group_idx[0]] + group_offsets[ - cur_group_idx[0]] - actual_rows = T.max( - 0, - T.min(block_token, cur_group_size[0] - - (m_start_padded - group_padded_offsets[cur_group_idx[0]]))) + m_start = m_start_padded - group_padded_offsets[cur_group_idx[0]] + group_offsets[cur_group_idx[0]] + actual_rows = T.max(0, T.min(block_token, cur_group_size[0] - (m_start_padded - group_padded_offsets[cur_group_idx[0]]))) T.clear(gate_logits_local) T.clear(up_logits_local) for k in T.Pipelined(T.ceildiv(dhidden, block_dhidden), num_stages=num_stages): T.copy( - input[m_start:m_start + block_token, k * block_dhidden:(k + 1) * block_dhidden], + input[m_start : m_start + block_token, k * block_dhidden : (k + 1) * block_dhidden], input_shared, - coalesced_width=coalesced_width) + coalesced_width=coalesced_width, + ) T.copy( - routed_expert_gate[cur_group_idx[0], - by * block_dexpert:(by + 1) * block_dexpert, - k * block_dhidden:(k + 1) * block_dhidden], - routed_expert_gate_shared, - coalesced_width=coalesced_width) - T.gemm( - input_shared, + routed_expert_gate[ + cur_group_idx[0], by * block_dexpert : (by + 1) * block_dexpert, k * block_dhidden : (k + 1) * block_dhidden + ], routed_expert_gate_shared, - gate_logits_local, - k_pack=k_pack, - transpose_B=True) + coalesced_width=coalesced_width, + ) + T.gemm(input_shared, routed_expert_gate_shared, gate_logits_local, 
k_pack=k_pack, transpose_B=True) T.copy( - routed_expert_up[cur_group_idx[0], by * block_dexpert:(by + 1) * block_dexpert, - k * block_dhidden:(k + 1) * block_dhidden], + routed_expert_up[ + cur_group_idx[0], by * block_dexpert : (by + 1) * block_dexpert, k * block_dhidden : (k + 1) * block_dhidden + ], routed_expert_up_shared, - coalesced_width=coalesced_width) - T.gemm( - input_shared, - routed_expert_up_shared, - up_logits_local, - k_pack=k_pack, - transpose_B=True) + coalesced_width=coalesced_width, + ) + T.gemm(input_shared, routed_expert_up_shared, up_logits_local, k_pack=k_pack, transpose_B=True) for i, j in T.Parallel(block_token, block_dexpert): - gate_logits_local[i, j] = gate_logits_local[i, j] * ( - 1.0 / (1.0 + T.exp2(-gate_logits_local[i, j] * scale))) + gate_logits_local[i, j] = gate_logits_local[i, j] * (1.0 / (1.0 + T.exp2(-gate_logits_local[i, j] * scale))) up_logits_local[i, j] = up_logits_local[i, j] * gate_logits_local[i, j] for i, j in T.Parallel(block_token, block_dexpert): @@ -222,8 +208,8 @@ def moe_forward_tilelang_routed(d_hidden, routed_expert_down_shared = T.alloc_shared((block_dhidden, block_dexpert), dtype=dtype) output_local = T.alloc_fragment((block_token, block_dhidden), dtype=accum_dtype) - cur_group_idx = T.alloc_local([1], "int32") - cur_group_size = T.alloc_local([1], "int32") + cur_group_idx = T.alloc_local([1], T.int32) + cur_group_size = T.alloc_local([1], T.int32) T.use_swizzle(10, enable=True) @@ -232,50 +218,35 @@ def moe_forward_tilelang_routed(d_hidden, cur_group_idx[0] = group_idx_for_bx[bx] cur_group_size[0] = group_sizes[cur_group_idx[0]] - m_start = m_start_padded - group_padded_offsets[cur_group_idx[0]] + group_offsets[ - cur_group_idx[0]] - actual_rows = T.max( - 0, - T.min(block_token, cur_group_size[0] - - (m_start_padded - group_padded_offsets[cur_group_idx[0]]))) + m_start = m_start_padded - group_padded_offsets[cur_group_idx[0]] + group_offsets[cur_group_idx[0]] + actual_rows = T.max(0, T.min(block_token, cur_group_size[0] - (m_start_padded - group_padded_offsets[cur_group_idx[0]]))) T.clear(output_local) for k in T.Pipelined(T.ceildiv(dexpert, block_dexpert), num_stages=num_stages): T.copy( - up_logits[m_start:m_start + block_token, - k * block_dexpert:(k + 1) * block_dexpert], + up_logits[m_start : m_start + block_token, k * block_dexpert : (k + 1) * block_dexpert], up_logits_shared, - coalesced_width=coalesced_width) + coalesced_width=coalesced_width, + ) T.copy( - routed_expert_down[cur_group_idx[0], - by * block_dhidden:(by + 1) * block_dhidden, - k * block_dexpert:(k + 1) * block_dexpert], - routed_expert_down_shared, - coalesced_width=coalesced_width) - T.gemm( - up_logits_shared, + routed_expert_down[ + cur_group_idx[0], by * block_dhidden : (by + 1) * block_dhidden, k * block_dexpert : (k + 1) * block_dexpert + ], routed_expert_down_shared, - output_local, - k_pack=k_pack, - transpose_B=True) + coalesced_width=coalesced_width, + ) + T.gemm(up_logits_shared, routed_expert_down_shared, output_local, k_pack=k_pack, transpose_B=True) for i, j in T.Parallel(block_token, block_dhidden): if i < actual_rows: - output[m_start + i, by * block_dhidden + - j] = output_local[i, j] * routed_expert_weights[m_start + i] + output[m_start + i, by * block_dhidden + j] = output_local[i, j] * routed_expert_weights[m_start + i] return kernel class Expert(nn.Module): - - def __init__(self, - config: Dict, - gate: torch.Tensor, - up: torch.Tensor, - down: torch.Tensor, - d_expert: Optional[int] = None): + def __init__(self, config: Dict, gate: 
torch.Tensor, up: torch.Tensor, down: torch.Tensor, d_expert: Optional[int] = None): super().__init__() self.config = config self.act_fn = nn.SiLU() @@ -294,14 +265,13 @@ class Expert(nn.Module): class MoEGate(nn.Module): - def __init__(self, config: Dict, weights: Dict): super().__init__() self.top_k: int = config["n_experts_per_token"] self.num_experts: int = config["n_routed_experts"] self.d_hidden: int = config["d_hidden"] - self.W_g_weight = weights['router.weight'].t() + self.W_g_weight = weights["router.weight"].t() def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: logits = x @ self.W_g_weight @@ -312,76 +282,69 @@ class MoEGate(nn.Module): class MoE(nn.Module): - - def __init__(self, - config: Dict, - shared_kernel: tilelang.JITKernel, - routed_kernel: tilelang.JITKernel, - weights: Dict, - padding_M: int = 128): + def __init__( + self, config: Dict, shared_kernel: tilelang.JITKernel, routed_kernel: tilelang.JITKernel, weights: Dict, padding_M: int = 128 + ): super().__init__() self.config = config self.shared_kernel = shared_kernel self.routed_kernel = routed_kernel self.padding_M = padding_M - self.experts = nn.ModuleList([ - Expert( - config, - gate=weights[f'experts.{i}.0.weight'], - up=weights[f'experts.{i}.1.weight'], - down=weights[f'experts.{i}.2.weight']) for i in range(config["n_routed_experts"]) - ]) + self.experts = nn.ModuleList( + [ + Expert( + config, + gate=weights[f"experts.{i}.0.weight"], + up=weights[f"experts.{i}.1.weight"], + down=weights[f"experts.{i}.2.weight"], + ) + for i in range(config["n_routed_experts"]) + ] + ) self.device = torch.device("cuda") self.gating_network = MoEGate(config, weights).to(self.device) shared_expert_dim = config["d_expert"] * config["n_shared_experts"] self.shared_expert = Expert( config=config, - gate=weights['shared_experts.0.weight'], - up=weights['shared_experts.1.weight'], - down=weights['shared_experts.2.weight'], - d_expert=shared_expert_dim).to(self.device) + gate=weights["shared_experts.0.weight"], + up=weights["shared_experts.1.weight"], + down=weights["shared_experts.2.weight"], + d_expert=shared_expert_dim, + ).to(self.device) self.expert_cache = torch.zeros( - (config["batch_size"] * config["seq_len"], config["d_hidden"]), - dtype=torch.float16, - device=self.device) - self.stacked_expert_w_gate = torch.stack([expert.W_gate_weight for expert in self.experts], - dim=0) - self.stacked_expert_w_up = torch.stack([expert.W_up_weight for expert in self.experts], - dim=0) - self.stacked_expert_w_down = torch.stack([expert.W_down_weight for expert in self.experts], - dim=0) + (config["batch_size"] * config["seq_len"], config["d_hidden"]), dtype=torch.float16, device=self.device + ) + self.stacked_expert_w_gate = torch.stack([expert.W_gate_weight for expert in self.experts], dim=0) + self.stacked_expert_w_up = torch.stack([expert.W_up_weight for expert in self.experts], dim=0) + self.stacked_expert_w_down = torch.stack([expert.W_down_weight for expert in self.experts], dim=0) self.stacked_expert_tokens = torch.empty( - (config["batch_size"] * config["seq_len"] * config["n_experts_per_token"], - self.config["d_hidden"]), + (config["batch_size"] * config["seq_len"] * config["n_experts_per_token"], self.config["d_hidden"]), dtype=torch.float16, - device=self.device) + device=self.device, + ) self.stacked_expert_weights = torch.empty( - (config["batch_size"] * config["seq_len"] * config["n_experts_per_token"]), - dtype=torch.float16, - device=self.device) + (config["batch_size"] * config["seq_len"] * 
config["n_experts_per_token"]), dtype=torch.float16, device=self.device + ) self.stacked_expert_tokens_idxs = torch.empty( - (config["batch_size"] * config["seq_len"] * config["n_experts_per_token"]), - dtype=torch.int64, - device=self.device) + (config["batch_size"] * config["seq_len"] * config["n_experts_per_token"]), dtype=torch.int64, device=self.device + ) self.up_logits_shared = torch.empty( - (config["batch_size"] * config["seq_len"], self.config["d_expert"]), - dtype=torch.float16, - device=self.device) + (config["batch_size"] * config["seq_len"], self.config["d_expert"]), dtype=torch.float16, device=self.device + ) self.expert_output_shared = torch.empty( - (config["batch_size"] * config["seq_len"], self.config["d_hidden"]), - dtype=torch.float16, - device=self.device) + (config["batch_size"] * config["seq_len"], self.config["d_hidden"]), dtype=torch.float16, device=self.device + ) self.up_logits_routed = torch.empty( - (config["batch_size"] * config["seq_len"] * config["n_experts_per_token"], - self.config["d_expert"]), + (config["batch_size"] * config["seq_len"] * config["n_experts_per_token"], self.config["d_expert"]), dtype=torch.float16, - device=self.device) + device=self.device, + ) self.expert_output_routed = torch.empty( - (config["batch_size"] * config["seq_len"] * config["n_experts_per_token"], - self.config["d_hidden"]), + (config["batch_size"] * config["seq_len"] * config["n_experts_per_token"], self.config["d_hidden"]), dtype=torch.float16, - device=self.device) + device=self.device, + ) @torch.no_grad() def forward(self, x: torch.Tensor) -> torch.Tensor: @@ -413,22 +376,20 @@ class MoE(nn.Module): self.stacked_expert_tokens[start_idx:end_idx] = expert_tokens self.stacked_expert_tokens_idxs[start_idx:end_idx] = exp_token_idxs - self.stacked_expert_weights[start_idx:end_idx] = flat_expert_weights[ - idxs[start_idx:end_idx]] + self.stacked_expert_weights[start_idx:end_idx] = flat_expert_weights[idxs[start_idx:end_idx]] group_sizes = torch.tensor(counts, dtype=torch.int32, device=self.device) - group_offset = torch.tensor( - tokens_per_expert - counts, dtype=torch.int32, device=self.device) + group_offset = torch.tensor(tokens_per_expert - counts, dtype=torch.int32, device=self.device) group_padded_offsets = [0 for _ in range(len(group_sizes))] for i in range(1, len(group_sizes)): - group_padded_offsets[i] = group_padded_offsets[i - 1] + math.ceil( - (counts[i - 1] + 1) / self.padding_M) * self.padding_M + group_padded_offsets[i] = group_padded_offsets[i - 1] + math.ceil((counts[i - 1] + 1) / self.padding_M) * self.padding_M block_token = 128 - M = math.ceil( - self.config["batch_size"] * self.config["seq_len"] * - self.config["n_experts_per_token"] / block_token) + self.config["n_routed_experts"] + M = ( + math.ceil(self.config["batch_size"] * self.config["seq_len"] * self.config["n_experts_per_token"] / block_token) + + self.config["n_routed_experts"] + ) group_idx_for_bx = [0 for _ in range(M)] for bx in range(M): @@ -437,8 +398,7 @@ class MoE(nn.Module): if m_start_padded >= group_padded_offsets[i]: group_idx_for_bx[bx] = i - group_padded_offsets = torch.tensor( - group_padded_offsets, dtype=torch.int32, device=self.device) + group_padded_offsets = torch.tensor(group_padded_offsets, dtype=torch.int32, device=self.device) group_idx_for_bx = torch.tensor(group_idx_for_bx, dtype=torch.int32, device=self.device) # Multi-stream execution @@ -448,11 +408,19 @@ class MoE(nn.Module): with torch.cuda.stream(routed_stream): # Tilelang version: Grouped GEMM - 
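# NOTE: illustrative sketch, not part of the original patch. The forward pass
# above flattens all (token, expert) assignments, pads every expert's group up
# to a multiple of `padding_M` rows, and precomputes which expert each kernel
# block index `bx` belongs to before launching the grouped-GEMM kernel below.
# The helper name `padded_group_layout` and the toy counts are assumptions for
# illustration only; the real values come from the top-k router.
import math


def padded_group_layout(counts, padding_M=128, block_token=128):
    # counts[i] = number of token assignments routed to expert i
    padded_offsets = [0] * len(counts)
    for i in range(1, len(counts)):
        padded_offsets[i] = padded_offsets[i - 1] + math.ceil((counts[i - 1] + 1) / padding_M) * padding_M
    num_blocks = math.ceil(sum(counts) / block_token) + len(counts)
    group_idx_for_bx = [0] * num_blocks
    for bx in range(num_blocks):
        m_start_padded = block_token * bx
        for i, offset in enumerate(padded_offsets):
            if m_start_padded >= offset:
                group_idx_for_bx[bx] = i
    return padded_offsets, group_idx_for_bx


# For counts = [130, 5, 260]: padded_offsets == [0, 256, 384], and each block
# bx of the routed kernel then looks up its expert via group_idx_for_bx[bx].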
self.routed_kernel(self.stacked_expert_tokens, self.stacked_expert_w_gate, - self.stacked_expert_w_up, self.stacked_expert_w_down, - self.stacked_expert_weights, group_sizes, group_offset, - group_padded_offsets, group_idx_for_bx, self.up_logits_routed, - self.expert_output_routed) + self.routed_kernel( + self.stacked_expert_tokens, + self.stacked_expert_w_gate, + self.stacked_expert_w_up, + self.stacked_expert_w_down, + self.stacked_expert_weights, + group_sizes, + group_offset, + group_padded_offsets, + group_idx_for_bx, + self.up_logits_routed, + self.expert_output_routed, + ) # Scatter reduce self.expert_cache = torch.scatter_reduce( @@ -460,14 +428,19 @@ class MoE(nn.Module): 0, self.stacked_expert_tokens_idxs.view(-1, 1).repeat(1, x_flat.shape[-1]), self.expert_output_routed, - reduce='sum') + reduce="sum", + ) routed_output = self.expert_cache.view(*orig_shape) with torch.cuda.stream(shared_stream): - - self.shared_kernel(x_flat, self.shared_expert.W_gate_weight, - self.shared_expert.W_up_weight, self.shared_expert.W_down_weight, - self.up_logits_shared, self.expert_output_shared) + self.shared_kernel( + x_flat, + self.shared_expert.W_gate_weight, + self.shared_expert.W_up_weight, + self.shared_expert.W_down_weight, + self.up_logits_shared, + self.expert_output_shared, + ) shared_output = self.expert_output_shared.view(*orig_shape) torch.cuda.synchronize() @@ -491,14 +464,15 @@ def custom_kernel(data: Tuple[torch.Tensor, Dict, Dict]) -> torch.Tensor: """ input_tensor, weights, config = data - dtype_str = "float16" + dtype_str = T.float16 shared_kernel = moe_forward_tilelang_shared( config["d_hidden"], config["d_expert"], config["n_shared_experts"], dtype=dtype_str, - num_tokens=config["batch_size"] * config["seq_len"]) + num_tokens=config["batch_size"] * config["seq_len"], + ) routed_kernel = moe_forward_tilelang_routed( config["d_hidden"], config["d_expert"], @@ -512,7 +486,8 @@ def custom_kernel(data: Tuple[torch.Tensor, Dict, Dict]) -> torch.Tensor: threads=256, num_stages=1, k_pack=1, - coalesced_width=2) + coalesced_width=2, + ) moe = MoE(config, shared_kernel, routed_kernel, weights, padding_M=128) @@ -521,13 +496,7 @@ def custom_kernel(data: Tuple[torch.Tensor, Dict, Dict]) -> torch.Tensor: return output -def main(d_hidden=7168, - d_expert=2048, - n_routed_experts=8, - n_shared_experts=1, - n_experts_per_token=4, - batch_size=1, - seq_len=8192): +def main(d_hidden=7168, d_expert=2048, n_routed_experts=8, n_shared_experts=1, n_experts_per_token=4, batch_size=1, seq_len=8192): config = { "dhidden": d_hidden, "dexpert": d_expert, @@ -536,7 +505,7 @@ def main(d_hidden=7168, "nexpertspertoken": n_experts_per_token, "bs": batch_size, "seqlen": seq_len, - "seed": 81394 + "seed": 81394, } data = generate_input(**config) diff --git a/examples/fusedmoe/example_fusedmoe_torch.py b/examples/fusedmoe/example_fusedmoe_torch.py index 00219c6e94b54070d14b8e98715ff6173b04b00e..6b6322aff7dce196ce12f371d83f47e5c1fa82e4 100644 --- a/examples/fusedmoe/example_fusedmoe_torch.py +++ b/examples/fusedmoe/example_fusedmoe_torch.py @@ -6,7 +6,6 @@ from typing import Dict, Tuple, Optional # Reference code in PyTorch class ExpertTorch(nn.Module): - def __init__(self, config: Dict, d_expert: Optional[int] = None): super().__init__() self.config = config @@ -25,7 +24,6 @@ class ExpertTorch(nn.Module): class MoEGateTorch(nn.Module): - def __init__(self, config: Dict): super().__init__() self.top_k: int = config["n_experts_per_token"] @@ -43,12 +41,10 @@ class MoEGateTorch(nn.Module): class 
MoETorch(nn.Module): - def __init__(self, config: Dict): super().__init__() self.config = config - self.experts = nn.ModuleList( - [ExpertTorch(config) for _ in range(config["n_routed_experts"])]) + self.experts = nn.ModuleList([ExpertTorch(config) for _ in range(config["n_routed_experts"])]) self.gating_network = MoEGateTorch(config) shared_expert_dim = config["d_expert"] * config["n_shared_experts"] self.shared_expert = ExpertTorch(config=config, d_expert=shared_expert_dim) @@ -67,8 +63,7 @@ class MoETorch(nn.Module): return routed_output + shared_output @torch.no_grad() - def moe_infer(self, x: torch.Tensor, flat_expert_indices: torch.Tensor, - flat_expert_weights: torch.Tensor) -> torch.Tensor: + def moe_infer(self, x: torch.Tensor, flat_expert_indices: torch.Tensor, flat_expert_weights: torch.Tensor) -> torch.Tensor: expert_cache = torch.zeros_like(x) # test_expert_cache = torch.zeros((x.shape[0] * self.config["n_experts_per_token"], self.config["d_hidden"])) # test_expert_tokens = torch.zeros((x.shape[0] * self.config["n_experts_per_token"], self.config["d_hidden"])) @@ -91,8 +86,7 @@ class MoETorch(nn.Module): expert_out = expert(expert_tokens) expert_out.mul_(flat_expert_weights[idxs[start_idx:end_idx]]) - expert_cache.scatter_reduce_( - 0, exp_token_idxs.view(-1, 1).repeat(1, x.shape[-1]), expert_out, reduce='sum') + expert_cache.scatter_reduce_(0, exp_token_idxs.view(-1, 1).repeat(1, x.shape[-1]), expert_out, reduce="sum") return expert_cache @@ -116,21 +110,21 @@ def ref_kernel(data: Tuple[torch.Tensor, Dict, Dict]) -> torch.Tensor: moe = MoETorch(config) # Fill in the given weights of the model - moe.gating_network.W_g.weight = nn.Parameter(weights['router.weight']) + moe.gating_network.W_g.weight = nn.Parameter(weights["router.weight"]) for i in range(num_experts): - gate_proj_weight = weights[f'experts.{i}.0.weight'] - up_proj_weight = weights[f'experts.{i}.1.weight'] - down_proj_weight = weights[f'experts.{i}.2.weight'] + gate_proj_weight = weights[f"experts.{i}.0.weight"] + up_proj_weight = weights[f"experts.{i}.1.weight"] + down_proj_weight = weights[f"experts.{i}.2.weight"] # Transpose weights to match expected shape for nn.Linear moe.experts[i].W_gate.weight = nn.Parameter(gate_proj_weight.t()) moe.experts[i].W_up.weight = nn.Parameter(up_proj_weight.t()) moe.experts[i].W_down.weight = nn.Parameter(down_proj_weight.t()) - moe.shared_expert.W_gate.weight = nn.Parameter(weights['shared_experts.0.weight'].t()) - moe.shared_expert.W_up.weight = nn.Parameter(weights['shared_experts.1.weight'].t()) - moe.shared_expert.W_down.weight = nn.Parameter(weights['shared_experts.2.weight'].t()) + moe.shared_expert.W_gate.weight = nn.Parameter(weights["shared_experts.0.weight"].t()) + moe.shared_expert.W_up.weight = nn.Parameter(weights["shared_experts.1.weight"].t()) + moe.shared_expert.W_down.weight = nn.Parameter(weights["shared_experts.2.weight"].t()) output = moe(input_tensor) @@ -140,10 +134,9 @@ def ref_kernel(data: Tuple[torch.Tensor, Dict, Dict]) -> torch.Tensor: # Input generation for the reference code -def generate_input(dhidden: int, dexpert: int, nroutedexperts: int, nsharedexperts: int, - nexpertspertoken: int, bs: int, seqlen: int, - seed: int) -> Tuple[torch.Tensor, Dict, Dict]: - +def generate_input( + dhidden: int, dexpert: int, nroutedexperts: int, nsharedexperts: int, nexpertspertoken: int, bs: int, seqlen: int, seed: int +) -> Tuple[torch.Tensor, Dict, Dict]: # Really dumb but for now _ isn't parsing correctly. 
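# NOTE: illustrative sketch, not part of the original patch. The reference
# MoETorch.moe_infer above dispatches tokens by argsort-ing the flattened
# (token, expert) assignments per expert, running each expert on its slice,
# and scatter-reducing the weighted outputs back into token order. The helper
# below reproduces that pattern in isolation; `toy_moe_dispatch`, its identity
# "expert", and the toy sizes in the usage line are assumptions for
# illustration only.
import torch


def toy_moe_dispatch(x, flat_expert_indices, flat_expert_weights, n_experts, top_k):
    # x: (num_tokens, d); flat_expert_indices / flat_expert_weights: (num_tokens * top_k,)
    expert_cache = torch.zeros_like(x)
    idxs = flat_expert_indices.argsort()
    ends = torch.bincount(flat_expert_indices, minlength=n_experts).cumsum(0).tolist()
    start = 0
    for end in ends:
        if end > start:
            exp_assign = idxs[start:end]          # assignment ids routed to this expert
            exp_token_idxs = exp_assign // top_k  # map assignments back to token rows
            expert_out = x[exp_token_idxs]        # stand-in for this expert's MLP output
            expert_out = expert_out * flat_expert_weights[exp_assign].unsqueeze(-1)
            expert_cache.scatter_reduce_(
                0, exp_token_idxs.view(-1, 1).repeat(1, x.shape[-1]), expert_out, reduce="sum"
            )
        start = end
    return expert_cache


# Usage with assumed toy sizes (4 tokens, d_hidden = 8, 2 experts, top_k = 1):
# y = toy_moe_dispatch(torch.randn(4, 8), torch.tensor([0, 1, 0, 1]), torch.ones(4), 2, 1)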
d_hidden = dhidden d_expert = dexpert @@ -163,50 +156,40 @@ def generate_input(dhidden: int, dexpert: int, nroutedexperts: int, nsharedexper "seq_len": seq_len, } - gen = torch.Generator(device='cuda') + gen = torch.Generator(device="cuda") gen.manual_seed(seed) num_experts = n_routed_experts expert_dim = d_expert weights = {} - input_tensor = torch.randn((batch_size, seq_len, d_hidden), - device='cuda', - dtype=torch.float16, - generator=gen).contiguous() + input_tensor = torch.randn((batch_size, seq_len, d_hidden), device="cuda", dtype=torch.float16, generator=gen).contiguous() # Initialize router weights - weights['router.weight'] = torch.randn( - (num_experts, d_hidden), device="cuda", dtype=torch.float16, - generator=gen) / math.sqrt(d_hidden) + weights["router.weight"] = torch.randn((num_experts, d_hidden), device="cuda", dtype=torch.float16, generator=gen) / math.sqrt(d_hidden) for i in range(num_experts): - weights[f'experts.{i}.0.weight'] = torch.randn( - (d_hidden, expert_dim), device='cuda', dtype=torch.float16, - generator=gen) / math.sqrt(expert_dim) - - weights[f'experts.{i}.1.weight'] = torch.randn( - (d_hidden, expert_dim), device='cuda', dtype=torch.float16, - generator=gen) / math.sqrt(expert_dim) - - weights[f'experts.{i}.2.weight'] = torch.randn( - (expert_dim, d_hidden), device='cuda', dtype=torch.float16, - generator=gen) / math.sqrt(d_hidden) - - weights['shared_experts.0.weight'] = torch.randn( - (d_hidden, expert_dim * n_shared_experts), - device='cuda', - dtype=torch.float16, - generator=gen) / math.sqrt(expert_dim * n_shared_experts) - weights['shared_experts.1.weight'] = torch.randn( - (d_hidden, expert_dim * n_shared_experts), - device='cuda', - dtype=torch.float16, - generator=gen) / math.sqrt(expert_dim * n_shared_experts) - weights['shared_experts.2.weight'] = torch.randn((expert_dim * n_shared_experts, d_hidden), - device='cuda', - dtype=torch.float16, - generator=gen) / math.sqrt(d_hidden) + weights[f"experts.{i}.0.weight"] = torch.randn( + (d_hidden, expert_dim), device="cuda", dtype=torch.float16, generator=gen + ) / math.sqrt(expert_dim) + + weights[f"experts.{i}.1.weight"] = torch.randn( + (d_hidden, expert_dim), device="cuda", dtype=torch.float16, generator=gen + ) / math.sqrt(expert_dim) + + weights[f"experts.{i}.2.weight"] = torch.randn( + (expert_dim, d_hidden), device="cuda", dtype=torch.float16, generator=gen + ) / math.sqrt(d_hidden) + + weights["shared_experts.0.weight"] = torch.randn( + (d_hidden, expert_dim * n_shared_experts), device="cuda", dtype=torch.float16, generator=gen + ) / math.sqrt(expert_dim * n_shared_experts) + weights["shared_experts.1.weight"] = torch.randn( + (d_hidden, expert_dim * n_shared_experts), device="cuda", dtype=torch.float16, generator=gen + ) / math.sqrt(expert_dim * n_shared_experts) + weights["shared_experts.2.weight"] = torch.randn( + (expert_dim * n_shared_experts, d_hidden), device="cuda", dtype=torch.float16, generator=gen + ) / math.sqrt(d_hidden) return (input_tensor, weights, config) diff --git a/examples/fusedmoe/test_example_fusedmoe.py b/examples/fusedmoe/test_example_fusedmoe.py index 806aff49ee3741816d9c687420553173829eccee..ba8415895d52f75dc8bf029b7a97e1cabc983b03 100644 --- a/examples/fusedmoe/test_example_fusedmoe.py +++ b/examples/fusedmoe/test_example_fusedmoe.py @@ -4,13 +4,8 @@ import example_fusedmoe_tilelang def test_example_fusedmoe_tilelang(): example_fusedmoe_tilelang.main( - d_hidden=1024, - d_expert=256, - n_routed_experts=8, - n_shared_experts=1, - n_experts_per_token=4, - 
batch_size=1, - seq_len=1024) + d_hidden=1024, d_expert=256, n_routed_experts=8, n_shared_experts=1, n_experts_per_token=4, batch_size=1, seq_len=1024 + ) if __name__ == "__main__": diff --git a/examples/gdn/example_chunk_delta_bwd.py b/examples/gdn/example_chunk_delta_bwd.py index 518b0ee21a62d1f6981d9bd2563b4b53eb853d45..39450bc5fc5917e1958687111ba684aabecf800d 100644 --- a/examples/gdn/example_chunk_delta_bwd.py +++ b/examples/gdn/example_chunk_delta_bwd.py @@ -12,6 +12,7 @@ print(tilelang.__file__, flush=True) # sys.path.insert(0, "/home/tzj/flash-linear-attention") try: import fla + print(fla.__file__, flush=True) from fla.ops.common.chunk_delta_h import chunk_gated_delta_rule_bwd_dhu except ImportError: @@ -24,7 +25,7 @@ import torch.nn.functional as F torch.random.manual_seed(0) # torch.set_printoptions(profile="full") -from utils import * +from test_utils import assert_similar def prepare_input( @@ -49,6 +50,7 @@ def prepare_input( G = F.logsigmoid(G) try: from fla.ops.utils.cumsum import chunk_local_cumsum + G = chunk_local_cumsum(G, chunk_size) except ImportError: print("fla not found, skip cumsum") @@ -125,8 +127,11 @@ def torch_chunk_gated_delta_rule_bwd_dhu( DV = dv.shape[-1] block_S = 64 BS = S // block_S - dh, dh0, dv2 = torch.empty((B, BS, H, DK, DV), dtype=output_dtype), torch.empty( - (B, H, DK, DV), dtype=state_dtype), torch.empty((B, S, H, DV), dtype=output_dtype) + dh, dh0, dv2 = ( + torch.empty((B, BS, H, DK, DV), dtype=output_dtype), + torch.empty((B, H, DK, DV), dtype=state_dtype), + torch.empty((B, S, H, DV), dtype=output_dtype), + ) dh_tmp = torch.empty((B, H, DK, DV), dtype=accum_dtype) dv_tmp = torch.empty((B, S, H, DV), dtype=accum_dtype) Q_tmp = torch.empty((B, S, H, DK), dtype=accum_dtype) @@ -138,34 +143,30 @@ def torch_chunk_gated_delta_rule_bwd_dhu( for i_s in range(BS - 1, -1, -1): dh[:, i_s, :, :, :] = dh_tmp - dv_tmp = torch.matmul(K[:, i_s * block_S:(i_s + 1) * block_S, :, :].permute(0, 2, 1, 3), - dh_tmp.to(K.dtype)).permute(0, 2, 1, 3) + dv_tmp = torch.matmul(K[:, i_s * block_S : (i_s + 1) * block_S, :, :].permute(0, 2, 1, 3), dh_tmp.to(K.dtype)).permute(0, 2, 1, 3) if use_g: for i_bh in range(B * H): i_b, i_h = i_bh // H, i_bh % H for i_s2 in range(block_S): - if G[i_b, i_s * block_S + block_S - 1, i_h] - G[i_b, i_s * block_S + i_s2, - i_h] <= 0: - dv_tmp[i_b, i_s2, - i_h, :] *= torch.exp(G[i_b, i_s * block_S + block_S - 1, i_h] - - G[i_b, i_s * block_S + i_s2, i_h]) + if G[i_b, i_s * block_S + block_S - 1, i_h] - G[i_b, i_s * block_S + i_s2, i_h] <= 0: + dv_tmp[i_b, i_s2, i_h, :] *= torch.exp(G[i_b, i_s * block_S + block_S - 1, i_h] - G[i_b, i_s * block_S + i_s2, i_h]) else: dv_tmp[i_b, i_s2, i_h, :] = 0 - dv_tmp += dv[:, i_s * block_S:(i_s + 1) * block_S, :, :] - dv2[:, i_s * block_S:(i_s + 1) * block_S, :, :] = dv_tmp + dv_tmp += dv[:, i_s * block_S : (i_s + 1) * block_S, :, :] + dv2[:, i_s * block_S : (i_s + 1) * block_S, :, :] = dv_tmp if use_g: G_last = G[:, i_s * block_S + block_S - 1, :] for i_bh in range(B * H): i_b, i_h = i_bh // H, i_bh % H dh_tmp[i_b, i_h, :, :] *= torch.exp(G_last[i_b, i_h]) - Q_tmp = Q[:, i_s * block_S:(i_s + 1) * block_S, :, :] + Q_tmp = Q[:, i_s * block_S : (i_s + 1) * block_S, :, :] for i_s2 in range(block_S): for i_k in range(DK): Q_tmp[:, i_s2, :, i_k] *= torch.exp(G[:, i_s * block_S + i_s2, :]) Q_tmp *= scale - W_tmp = W[:, i_s * block_S:(i_s + 1) * block_S, :, :] - dO_tmp = dO[:, i_s * block_S:(i_s + 1) * block_S, :, :] + W_tmp = W[:, i_s * block_S : (i_s + 1) * block_S, :, :] + dO_tmp = dO[:, i_s * block_S : 
(i_s + 1) * block_S, :, :] torch.backends.cuda.matmul.allow_tf32 = True dh_tmp += torch.matmul(Q_tmp.permute(0, 2, 3, 1), dO_tmp.permute(0, 2, 1, 3)) @@ -223,19 +224,19 @@ def tilelang_chunk_gated_delta_rule_bwd_dhu( @T.prim_func def kernel( - # Input - Q: T.Tensor(Q_shape, dtype=input_dtype), - K: T.Tensor(K_shape, dtype=input_dtype), - W: T.Tensor(W_shape, dtype=input_dtype), - G: T.Tensor(G_shape, dtype=gate_dtype), - h0: T.Tensor(h0_shape, dtype=input_dtype), - dht: T.Tensor(dht_shape, dtype=input_dtype), - dO: T.Tensor(dO_shape, dtype=input_dtype), - dv: T.Tensor(dv_shape, dtype=input_dtype), - # Output - dh: T.Tensor(dh_shape, dtype=output_dtype), - dh0: T.Tensor(dh0_shape, dtype=state_dtype), - dv2: T.Tensor(dv2_shape, dtype=output_dtype), + # Input + Q: T.Tensor(Q_shape, dtype=input_dtype), + K: T.Tensor(K_shape, dtype=input_dtype), + W: T.Tensor(W_shape, dtype=input_dtype), + G: T.Tensor(G_shape, dtype=gate_dtype), + h0: T.Tensor(h0_shape, dtype=input_dtype), + dht: T.Tensor(dht_shape, dtype=input_dtype), + dO: T.Tensor(dO_shape, dtype=input_dtype), + dv: T.Tensor(dv_shape, dtype=input_dtype), + # Output + dh: T.Tensor(dh_shape, dtype=output_dtype), + dh0: T.Tensor(dh0_shape, dtype=state_dtype), + dv2: T.Tensor(dv2_shape, dtype=output_dtype), ): with T.Kernel(T.ceildiv(DV, block_DV), B * H, threads=threads) as (bv, bbh): bb, bh = bbh // H, bbh % H @@ -249,13 +250,13 @@ def tilelang_chunk_gated_delta_rule_bwd_dhu( dv_fragment = T.alloc_fragment((block_S, block_DV), dtype=accum_dtype) dv_fragment_2 = T.alloc_fragment((block_S, block_DV), dtype=accum_dtype) dO_shared = T.alloc_shared((block_S, block_DV), dtype=input_dtype) - dO_shared_t = T.alloc_shared((block_DV, block_S), dtype="float32") - dO_fragment = T.alloc_fragment((block_S, block_DV), dtype="float32") - dO_fragment_t = T.alloc_fragment((block_DV, block_S), dtype="float32") + dO_shared_t = T.alloc_shared((block_DV, block_S), dtype=T.float32) + dO_fragment = T.alloc_fragment((block_S, block_DV), dtype=T.float32) + dO_fragment_t = T.alloc_fragment((block_DV, block_S), dtype=T.float32) K_shared = T.alloc_shared((block_S, DK), dtype=input_dtype) Q_shared = T.alloc_shared((block_S, DK), dtype=input_dtype) - Q_shared_fp32 = T.alloc_shared((block_S, DK), dtype="float32") + Q_shared_fp32 = T.alloc_shared((block_S, DK), dtype=T.float32) W_shared = T.alloc_shared((block_S, DK), dtype=input_dtype) G_last_local = T.alloc_local((1), dtype=gate_dtype) @@ -269,20 +270,22 @@ def tilelang_chunk_gated_delta_rule_bwd_dhu( T.use_swizzle(10) - T.annotate_layout({ - b_dh_shared: tilelang.layout.make_swizzled_layout(b_dh_shared), - b_dh_shared_fp32: tilelang.layout.make_swizzled_layout(b_dh_shared_fp32), - dv_shared: tilelang.layout.make_swizzled_layout(dv_shared), - dO_shared: tilelang.layout.make_swizzled_layout(dO_shared), - dO_shared_t: tilelang.layout.make_swizzled_layout(dO_shared_t), - K_shared: tilelang.layout.make_swizzled_layout(K_shared), - Q_shared: tilelang.layout.make_swizzled_layout(Q_shared), - Q_shared_fp32: tilelang.layout.make_swizzled_layout(Q_shared_fp32), - W_shared: tilelang.layout.make_swizzled_layout(W_shared), - }) + T.annotate_layout( + { + b_dh_shared: tilelang.layout.make_swizzled_layout(b_dh_shared), + b_dh_shared_fp32: tilelang.layout.make_swizzled_layout(b_dh_shared_fp32), + dv_shared: tilelang.layout.make_swizzled_layout(dv_shared), + dO_shared: tilelang.layout.make_swizzled_layout(dO_shared), + dO_shared_t: tilelang.layout.make_swizzled_layout(dO_shared_t), + K_shared: 
tilelang.layout.make_swizzled_layout(K_shared), + Q_shared: tilelang.layout.make_swizzled_layout(Q_shared), + Q_shared_fp32: tilelang.layout.make_swizzled_layout(Q_shared_fp32), + W_shared: tilelang.layout.make_swizzled_layout(W_shared), + } + ) if use_final_state_gradient: - T.copy(dht[bb, bh, 0:DK, bv * block_DV:(bv + 1) * block_DV], b_dh_shared) + T.copy(dht[bb, bh, 0:DK, bv * block_DV : (bv + 1) * block_DV], b_dh_shared) T.copy(b_dh_shared, b_dh_fragment) else: T.clear(b_dh_fragment) @@ -293,17 +296,14 @@ def tilelang_chunk_gated_delta_rule_bwd_dhu( # Store the updated dh T.copy(b_dh_fragment, b_dh_shared) - T.copy(b_dh_shared, dh[bb, i_s_inv, bh, 0:DK, bv * block_DV:(bv + 1) * block_DV]) + T.copy(b_dh_shared, dh[bb, i_s_inv, bh, 0:DK, bv * block_DV : (bv + 1) * block_DV]) # Update dv - T.copy(K[bb, i_s_inv * block_S:(i_s_inv + 1) * block_S, bh, 0:DK], K_shared) + T.copy(K[bb, i_s_inv * block_S : (i_s_inv + 1) * block_S, bh, 0:DK], K_shared) T.gemm(K_shared, b_dh_shared, dv_fragment, clear_accum=True) if use_g: - T.copy( - G[bb, i_s_inv * block_S:(i_s_inv + 1) * block_S, bh], - G_shared, - disable_tma=True) + T.copy(G[bb, i_s_inv * block_S : (i_s_inv + 1) * block_S, bh], G_shared, disable_tma=True) T.copy(G_shared, G_fragment) G_last_local[0] = G_shared[block_S - 1] G_last_local_exp[0] = T.exp(G_last_local[0]) @@ -313,27 +313,22 @@ def tilelang_chunk_gated_delta_rule_bwd_dhu( # with T.If(G_last_local[0] - G_shared[i_s2] <= 0): with T.If(G_last_local[0] - G_fragment[i_s2] <= 0): with T.Then(): - dv_fragment[i_s2, - i_v] = dv_fragment[i_s2, i_v] * G_fragment_post[i_s2] + dv_fragment[i_s2, i_v] = dv_fragment[i_s2, i_v] * G_fragment_post[i_s2] with T.Else(): dv_fragment[i_s2, i_v] = 0 - T.copy( - dv[bb, i_s_inv * block_S:(i_s_inv + 1) * block_S, bh, - bv * block_DV:(bv + 1) * block_DV], dv_shared) + T.copy(dv[bb, i_s_inv * block_S : (i_s_inv + 1) * block_S, bh, bv * block_DV : (bv + 1) * block_DV], dv_shared) T.copy(dv_shared, dv_fragment_2) for i_s2, i_v in T.Parallel(block_S, block_DV): dv_fragment[i_s2, i_v] = dv_fragment[i_s2, i_v] + dv_fragment_2[i_s2, i_v] # Store the updated dv T.copy(dv_fragment, dv_shared) - T.copy( - dv_shared, dv2[bb, i_s_inv * block_S:(i_s_inv + 1) * block_S, bh, - bv * block_DV:(bv + 1) * block_DV]) + T.copy(dv_shared, dv2[bb, i_s_inv * block_S : (i_s_inv + 1) * block_S, bh, bv * block_DV : (bv + 1) * block_DV]) # Update dh - T.copy(Q[bb, i_s_inv * block_S:(i_s_inv + 1) * block_S, bh, 0:DK], Q_shared) - T.copy(W[bb, i_s_inv * block_S:(i_s_inv + 1) * block_S, bh, 0:DK], W_shared) + T.copy(Q[bb, i_s_inv * block_S : (i_s_inv + 1) * block_S, bh, 0:DK], Q_shared) + T.copy(W[bb, i_s_inv * block_S : (i_s_inv + 1) * block_S, bh, 0:DK], W_shared) T.clear(Q_fragment) if use_g: @@ -353,9 +348,7 @@ def tilelang_chunk_gated_delta_rule_bwd_dhu( for i_s2, i_k in T.Parallel(block_S, DK): Q_fragment_t[i_k, i_s2] = Q_fragment[i_s2, i_k] - T.copy( - dO[bb, i_s_inv * block_S:(i_s_inv + 1) * block_S, bh, - bv * block_DV:(bv + 1) * block_DV], dO_shared) + T.copy(dO[bb, i_s_inv * block_S : (i_s_inv + 1) * block_S, bh, bv * block_DV : (bv + 1) * block_DV], dO_shared) T.copy(dO_shared, dO_fragment) for i_s2, i_v in T.Parallel(block_S, block_DV): dO_fragment_t[i_v, i_s2] = dO_fragment[i_s2, i_v] @@ -369,7 +362,7 @@ def tilelang_chunk_gated_delta_rule_bwd_dhu( b_dh_fragment[i_k, i_v] += b_dh_fragment_1[i_k, i_v] - b_dh_fragment_2[i_k, i_v] if use_initial_state: - T.copy(b_dh_fragment, dh0[bb, bh, 0:DK, bv * block_DV:(bv + 1) * block_DV]) + T.copy(b_dh_fragment, dh0[bb, bh, 0:DK, bv * 
block_DV : (bv + 1) * block_DV]) return kernel @@ -444,44 +437,61 @@ def run_test( num_stages=0, use_torch=False, ): - Q, K, W, G, h0, dht, dO, dv = prepare_input(B, S, H, DK, DV, chunk_size, - getattr(torch, input_dtype), - getattr(torch, output_dtype), - getattr(torch, accum_dtype), - getattr(torch, gate_dtype), - getattr(torch, state_dtype)) - dh_ref, dh0_ref, dv2_ref = prepare_output(B, S, H, DK, DV, chunk_size, - getattr(torch, output_dtype), - getattr(torch, gate_dtype), - getattr(torch, state_dtype)) - dh_tilelang, dh0_tilelang, dv2_tilelang = prepare_output(B, S, H, DK, DV, chunk_size, - getattr(torch, output_dtype), - getattr(torch, gate_dtype), - getattr(torch, state_dtype)) + Q, K, W, G, h0, dht, dO, dv = prepare_input( + B, + S, + H, + DK, + DV, + chunk_size, + getattr(torch, input_dtype), + getattr(torch, output_dtype), + getattr(torch, accum_dtype), + getattr(torch, gate_dtype), + getattr(torch, state_dtype), + ) + dh_ref, dh0_ref, dv2_ref = prepare_output( + B, S, H, DK, DV, chunk_size, getattr(torch, output_dtype), getattr(torch, gate_dtype), getattr(torch, state_dtype) + ) + dh_tilelang, dh0_tilelang, dv2_tilelang = prepare_output( + B, S, H, DK, DV, chunk_size, getattr(torch, output_dtype), getattr(torch, gate_dtype), getattr(torch, state_dtype) + ) # fla ref print("fla running...", flush=True) if use_g: - dh_ref, dh0_ref, dv2_ref = chunk_gated_delta_rule_bwd_dhu(Q, K, W, G, h0, dht, dO, dv, - scale) + dh_ref, dh0_ref, dv2_ref = chunk_gated_delta_rule_bwd_dhu(Q, K, W, G, h0, dht, dO, dv, scale) else: G = G.fill_(0) - dh_ref, dh0_ref, dv2_ref = chunk_gated_delta_rule_bwd_dhu(Q, K, W, G, h0, dht, dO, dv, - scale) + dh_ref, dh0_ref, dv2_ref = chunk_gated_delta_rule_bwd_dhu(Q, K, W, G, h0, dht, dO, dv, scale) # tilelang print("tilelang running...", flush=True) - kernel = tilelang_chunk_gated_delta_rule_bwd_dhu(B, S, H, DK, DV, input_dtype, output_dtype, - accum_dtype, gate_dtype, state_dtype, - chunk_size, scale, use_g, use_initial_state, - use_final_state_gradient, block_DV, threads, - num_stages) + kernel = tilelang_chunk_gated_delta_rule_bwd_dhu( + B, + S, + H, + DK, + DV, + input_dtype, + output_dtype, + accum_dtype, + gate_dtype, + state_dtype, + chunk_size, + scale, + use_g, + use_initial_state, + use_final_state_gradient, + block_DV, + threads, + num_stages, + ) # kernel = tilelang.compile(program) print(kernel.get_kernel_source()) dh_tilelang, dh0_tilelang, dv2_tilelang = kernel(Q, K, W, G, h0, dht, dO, dv) - fla_time = do_bench( - chunk_gated_delta_rule_bwd_dhu, Q, K, W, G, h0, dht, dO, dv, scale, chunk_size=chunk_size) + fla_time = do_bench(chunk_gated_delta_rule_bwd_dhu, Q, K, W, G, h0, dht, dO, dv, scale, chunk_size=chunk_size) tilelang_time = do_bench(kernel, Q, K, W, G, h0, dht, dO, dv) print(f"fla time: {fla_time} ms") @@ -496,19 +506,47 @@ def run_test( print("torch running...", flush=True) if use_g: dh_ref_torch, dh0_ref_torch, dv2_ref_torch = torch_chunk_gated_delta_rule_bwd_dhu( - Q, K, W, G, h0, dht, dO, dv, scale, use_g, use_initial_state, - use_final_state_gradient, getattr(torch, input_dtype), getattr(torch, output_dtype), - getattr(torch, accum_dtype), getattr(torch, - gate_dtype), getattr(torch, state_dtype)) + Q, + K, + W, + G, + h0, + dht, + dO, + dv, + scale, + use_g, + use_initial_state, + use_final_state_gradient, + getattr(torch, input_dtype), + getattr(torch, output_dtype), + getattr(torch, accum_dtype), + getattr(torch, gate_dtype), + getattr(torch, state_dtype), + ) dh_ref_torch = dh_ref_torch.cuda() dh0_ref_torch = dh0_ref_torch.cuda() 
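# NOTE: illustrative check, not part of the original patch. Several kernels in
# this patch evaluate exp(x) as exp2(x * log2(e)): see the
# `scale = 1.44269504  # log2(e)` constant in the fused-MoE kernels above and
# the `T.exp2((...) * 1.442695)` rewrite in example_chunk_delta_h.py below.
# A quick numerical sanity check of that identity on float32 inputs:
import torch

LOG2_E = 1.4426950408889634  # math.log2(math.e)
x = torch.linspace(-20.0, 20.0, steps=101)
assert torch.allclose(torch.exp(x), torch.exp2(x * LOG2_E), rtol=1e-5, atol=0.0)
# The truncated 1.442695 constant used in the kernels shifts the base-2
# exponent by roughly |x| * 4e-8, which is negligible at the fp16/bf16
# precision these examples run in.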
dv2_ref_torch = dv2_ref_torch.cuda() else: dh_ref_torch, dh0_ref_torch, dv2_ref_torch = torch_chunk_gated_delta_rule_bwd_dhu( - Q, K, W, None, h0, dht, dO, dv, scale, use_g, use_initial_state, - use_final_state_gradient, getattr(torch, input_dtype), getattr(torch, output_dtype), - getattr(torch, accum_dtype), getattr(torch, - gate_dtype), getattr(torch, state_dtype)) + Q, + K, + W, + None, + h0, + dht, + dO, + dv, + scale, + use_g, + use_initial_state, + use_final_state_gradient, + getattr(torch, input_dtype), + getattr(torch, output_dtype), + getattr(torch, accum_dtype), + getattr(torch, gate_dtype), + getattr(torch, state_dtype), + ) dh_ref_torch = dh_ref_torch.cuda() dh0_ref_torch = dh0_ref_torch.cuda() dv2_ref_torch = dv2_ref_torch.cuda() @@ -554,11 +592,11 @@ def main(): H=8, DK=DK, DV=128, - input_dtype="bfloat16", - output_dtype="bfloat16", - accum_dtype="float32", - gate_dtype="float32", - state_dtype="float32", + input_dtype=T.bfloat16, + output_dtype=T.bfloat16, + accum_dtype=T.float32, + gate_dtype=T.float32, + state_dtype=T.float32, chunk_size=64, scale=DK**-0.5, use_g=True, diff --git a/examples/gdn/example_chunk_delta_h.py b/examples/gdn/example_chunk_delta_h.py index 4d6b657ffc88345cf55a780a6f809f5a117ab13d..d316a62116c8b2fda7ca24daddc838e0cda94144 100644 --- a/examples/gdn/example_chunk_delta_h.py +++ b/examples/gdn/example_chunk_delta_h.py @@ -3,12 +3,14 @@ import sys # noqa: F401 import tilelang import tilelang.language as T +from tilelang.autotuner import autotune # Add your fla repository path to sys.path # Currently we use the fla repository from the flash-linear-attention project at commit id f03cb3ae # sys.path.insert(0, "/home/tzj/flash-linear-attention") try: import fla + print(fla.__file__) from fla.ops.common.chunk_delta_h import chunk_gated_delta_rule_fwd_h except ImportError: @@ -19,7 +21,7 @@ import torch import torch.nn.functional as F from tilelang.engine.callback import register_cuda_postproc_callback # noqa: F401 -from utils import * +from test_utils import assert_similar # (zhengju) We can slightly modify the generated cuda code from tilelang lowering # in the debug folder to make the performance better. 
To enable this callback, @@ -55,6 +57,7 @@ def prepare_input( G = F.logsigmoid(G) try: from fla.ops.utils.cumsum import chunk_local_cumsum + G = chunk_local_cumsum(G, chunk_size) except ImportError: print("fla not found, skip cumsum") @@ -80,7 +83,21 @@ def prepare_output( return h, final_state, V_new -@tilelang.jit(out_idx=[-3, -2, -1]) +def get_configs(): + import itertools + + block_DK = [32, 64, 128] + block_DV = [32, 64, 128] + threads = [128, 256] + num_stages = [1, 2, 3] + _configs = list(itertools.product(block_DK, block_DV, threads, num_stages)) + + configs = [{"block_DK": c[0], "block_DV": c[1], "threads": c[2], "num_stages": c[3]} for c in _configs] + return configs + + +@autotune(configs=get_configs(), warmup=3, rep=5) +@tilelang.jit(out_idx=[-3, -2, -1], pass_configs={tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True}) def tilelang_chunk_gated_delta_rule_fwd_h( # task config B, @@ -94,15 +111,15 @@ def tilelang_chunk_gated_delta_rule_fwd_h( gate_dtype, state_dtype, chunk_size, - use_g=True, - use_initial_state=True, - store_final_state=True, - save_new_value=True, + use_g, + use_initial_state, + store_final_state, + save_new_value, # kernel config block_DK=64, - block_DV=64, - threads=256, - num_stages=0, + block_DV=32, + threads=128, + num_stages=1, ): block_S = chunk_size BS = S // block_S @@ -118,14 +135,14 @@ def tilelang_chunk_gated_delta_rule_fwd_h( @T.prim_func def kernel( - K: T.Tensor(K_shape, dtype=input_dtype), - W: T.Tensor(W_shape, dtype=input_dtype), - U: T.Tensor(U_shape, dtype=input_dtype), - G: T.Tensor(G_shape, dtype=gate_dtype), - initial_state: T.Tensor(initial_state_shape, dtype=input_dtype), - h: T.Tensor(h_shape, dtype=output_dtype), - final_state: T.Tensor(final_state_shape, dtype=state_dtype), - V_new: T.Tensor(V_shape, dtype=output_dtype), + K: T.Tensor(K_shape, dtype=input_dtype), + W: T.Tensor(W_shape, dtype=input_dtype), + U: T.Tensor(U_shape, dtype=input_dtype), + G: T.Tensor(G_shape, dtype=gate_dtype), + initial_state: T.Tensor(initial_state_shape, dtype=input_dtype), + h: T.Tensor(h_shape, dtype=output_dtype), + final_state: T.Tensor(final_state_shape, dtype=state_dtype), + V_new: T.Tensor(V_shape, dtype=output_dtype), ): with T.Kernel(T.ceildiv(DV, block_DV), B * H, threads=threads) as (bv, bbh): bb, bh = bbh // H, bbh % H @@ -143,35 +160,35 @@ def tilelang_chunk_gated_delta_rule_fwd_h( G_shared = T.alloc_shared((block_S, block_DV), dtype=gate_dtype) G_fragment = T.alloc_fragment((block_S, block_DV), dtype=gate_dtype) - T.annotate_layout({ - b_h_shared: tilelang.layout.make_swizzled_layout(b_h_shared), - U_shared: tilelang.layout.make_swizzled_layout(U_shared), - W_shared: tilelang.layout.make_swizzled_layout(W_shared), - V_new_shared: tilelang.layout.make_swizzled_layout(V_new_shared), - K_shared: tilelang.layout.make_swizzled_layout(K_shared), - G_shared: tilelang.layout.make_swizzled_layout(G_shared), - }) + T.annotate_layout( + { + b_h_shared: tilelang.layout.make_swizzled_layout(b_h_shared), + U_shared: tilelang.layout.make_swizzled_layout(U_shared), + W_shared: tilelang.layout.make_swizzled_layout(W_shared), + V_new_shared: tilelang.layout.make_swizzled_layout(V_new_shared), + K_shared: tilelang.layout.make_swizzled_layout(K_shared), + G_shared: tilelang.layout.make_swizzled_layout(G_shared), + } + ) T.use_swizzle(10) if use_initial_state: - T.copy(initial_state[bb, bh, 0:DK, bv * block_DV:(bv + 1) * block_DV], b_h_shared) + T.copy(initial_state[bb, bh, 0:DK, bv * block_DV : (bv + 1) * block_DV], b_h_shared) T.copy(b_h_shared, 
b_h_fragment) else: T.clear(b_h_fragment) for i_s in T.Pipelined(T.ceildiv(S, block_S), num_stages=num_stages): # Store previous result to the hidden tensor, like the epilogue - T.copy(b_h_shared, h[bb, i_s, bh, 0:DK, bv * block_DV:(bv + 1) * block_DV]) + T.copy(b_h_shared, h[bb, i_s, bh, 0:DK, bv * block_DV : (bv + 1) * block_DV]) # Recurrence - T.copy(W[bb, i_s * block_S:(i_s + 1) * block_S, bh, 0:DK], W_shared) + T.copy(W[bb, i_s * block_S : (i_s + 1) * block_S, bh, 0:DK], W_shared) T.gemm(W_shared, b_h_shared, V_new_fragment, clear_accum=True) # U - W * S - T.copy( - U[bb, i_s * block_S:(i_s + 1) * block_S, bh, bv * block_DV:(bv + 1) * block_DV], - U_shared) + T.copy(U[bb, i_s * block_S : (i_s + 1) * block_S, bh, bv * block_DV : (bv + 1) * block_DV], U_shared) T.copy(U_shared, U_fragment) for i_s2, i_v in T.Parallel(block_S, block_DV): V_new_fragment[i_s2, i_v] = -V_new_fragment[i_s2, i_v] + U_fragment[i_s2, i_v] @@ -179,11 +196,9 @@ def tilelang_chunk_gated_delta_rule_fwd_h( # Save V_new if save_new_value: T.copy(V_new_fragment, dst=V_new_shared) - T.copy( - V_new_shared, V_new[bb, i_s * block_S:(i_s + 1) * block_S, bh, - bv * block_DV:(bv + 1) * block_DV]) + T.copy(V_new_shared, V_new[bb, i_s * block_S : (i_s + 1) * block_S, bh, bv * block_DV : (bv + 1) * block_DV]) - T.copy(K[bb, i_s * block_S:(i_s + 1) * block_S, bh, 0:DK], K_shared) + T.copy(K[bb, i_s * block_S : (i_s + 1) * block_S, bh, 0:DK], K_shared) # use_g if use_g: G_last_local[0] = G[bb, (i_s + 1) * block_S - 1, bh] @@ -193,11 +208,12 @@ def tilelang_chunk_gated_delta_rule_fwd_h( for i_s2, i_v in T.Parallel(block_S, block_DV): with T.If(G_last_local[0] - G_fragment[i_s2, i_v] <= 0): with T.Then(): - V_new_fragment[i_s2, i_v] = V_new_fragment[i_s2, i_v] * T.exp( - G_last_local[0] - G_fragment[i_s2, i_v]) + V_new_fragment[i_s2, i_v] = V_new_fragment[i_s2, i_v] * T.exp2( + (G_last_local[0] - G_fragment[i_s2, i_v]) * 1.442695 + ) with T.Else(): V_new_fragment[i_s2, i_v] = 0 - G_last_local[0] = T.exp(G_last_local[0]) + G_last_local[0] = T.exp2(G_last_local[0] * 1.442695) for i_k, i_v in T.Parallel(DK, block_DV): b_h_fragment[i_k, i_v] *= G_last_local[0] @@ -209,7 +225,7 @@ def tilelang_chunk_gated_delta_rule_fwd_h( # Save final state if store_final_state: - T.copy(b_h_fragment, final_state[bb, bh, 0:DK, bv * block_DV:(bv + 1) * block_DV]) + T.copy(b_h_fragment, final_state[bb, bh, 0:DK, bv * block_DV : (bv + 1) * block_DV]) return kernel @@ -260,47 +276,77 @@ def run_test( threads=128, num_stages=0, ): - K, W, U, G, initial_state = prepare_input(B, S, H, DK, DV, chunk_size, - getattr(torch, input_dtype), - getattr(torch, output_dtype), - getattr(torch, accum_dtype), - getattr(torch, gate_dtype)) - h_ref, final_state_ref, V_new_ref = prepare_output(B, S, H, DK, DV, chunk_size, - getattr(torch, output_dtype), - getattr(torch, state_dtype)) - h_tilelang, final_state_tilelang, V_new_tilelang = prepare_output(B, S, H, DK, DV, chunk_size, - getattr(torch, output_dtype), - getattr(torch, state_dtype)) + K, W, U, G, initial_state = prepare_input( + B, + S, + H, + DK, + DV, + chunk_size, + getattr(torch, input_dtype), + getattr(torch, output_dtype), + getattr(torch, accum_dtype), + getattr(torch, gate_dtype), + ) + h_ref, final_state_ref, V_new_ref = prepare_output( + B, S, H, DK, DV, chunk_size, getattr(torch, output_dtype), getattr(torch, state_dtype) + ) + h_tilelang, final_state_tilelang, V_new_tilelang = prepare_output( + B, S, H, DK, DV, chunk_size, getattr(torch, output_dtype), getattr(torch, state_dtype) + ) # fla ref - h_ref, 
V_new_ref, final_state_ref = chunk_gated_delta_rule_fwd_h(K, W, U, G, initial_state, - store_final_state, chunk_size, - save_new_value) + h_ref, V_new_ref, final_state_ref = chunk_gated_delta_rule_fwd_h( + k=K, + w=W, + u=U, + g=G, + initial_state=initial_state, + output_final_state=store_final_state, + chunk_size=chunk_size, + save_new_value=save_new_value, + ) # tilelang - kernel = tilelang_chunk_gated_delta_rule_fwd_h(B, S, H, DK, DV, input_dtype, output_dtype, - accum_dtype, gate_dtype, state_dtype, chunk_size, - use_g, use_initial_state, store_final_state, - save_new_value, block_DK, block_DV, threads, - num_stages) + kernel = tilelang_chunk_gated_delta_rule_fwd_h( + B, + S, + H, + DK, + DV, + input_dtype, + output_dtype, + accum_dtype, + gate_dtype, + state_dtype, + chunk_size, + use_g, + use_initial_state, + store_final_state, + save_new_value, + ) h_tilelang, final_state_tilelang, V_new_tilelang = kernel(K, W, U, G, initial_state) # (zhengju) If you want to print the generated cuda code, you can uncomment the following line # print("CUDA Code:\n", kernel.get_kernel_source()) - fla_time = do_bench(chunk_gated_delta_rule_fwd_h, K, W, U, G, initial_state, store_final_state, - chunk_size, save_new_value) + fla_time = do_bench( + chunk_gated_delta_rule_fwd_h, + k=K, + w=W, + u=U, + g=G, + initial_state=initial_state, + output_final_state=store_final_state, + chunk_size=chunk_size, + save_new_value=save_new_value, + ) tilelang_time = do_bench(kernel, K, W, U, G, initial_state) # check correctness try: h_ref_fp32 = h_ref.to(torch.float32) h_tilelang_fp32 = h_tilelang.to(torch.float32) - assert_similar( - h_ref_fp32, - h_tilelang_fp32, - eps=1e-5, - name="tilelang chunk gated delta rule fwd h", - raise_assert=False) + assert_similar(h_ref_fp32, h_tilelang_fp32, eps=1e-5, name="tilelang chunk gated delta rule fwd h", raise_assert=False) print("tilelang chunk gated delta rule fwd h passed √") except Exception as e: print("tilelang chunk gated delta rule fwd h failed ✗") @@ -314,7 +360,8 @@ def run_test( final_state_tilelang_fp32, eps=1e-5, name="tilelang chunk gated delta rule fwd final_state", - raise_assert=False) + raise_assert=False, + ) print("tilelang chunk gated delta rule fwd final_state passed √") except Exception as e: print("tilelang chunk gated delta rule fwd final_state failed ✗") @@ -323,12 +370,7 @@ def run_test( try: V_new_ref_fp32 = V_new_ref.to(torch.float32) V_new_tilelang_fp32 = V_new_tilelang.to(torch.float32) - assert_similar( - V_new_ref_fp32, - V_new_tilelang_fp32, - eps=1e-5, - name="tilelang chunk gated delta rule fwd V_new", - raise_assert=False) + assert_similar(V_new_ref_fp32, V_new_tilelang_fp32, eps=1e-5, name="tilelang chunk gated delta rule fwd V_new", raise_assert=False) print("tilelang chunk gated delta rule fwd V_new passed √") except Exception as e: print("tilelang chunk gated delta rule fwd V_new failed ✗") @@ -345,20 +387,20 @@ def main(): H=32, DK=128, DV=128, - input_dtype="bfloat16", - output_dtype="bfloat16", - accum_dtype="float32", - gate_dtype="float32", - state_dtype="float32", + input_dtype=T.bfloat16, + output_dtype=T.bfloat16, + accum_dtype=T.float32, + gate_dtype=T.float32, + state_dtype=T.float32, chunk_size=64, use_g=True, - use_initial_state=True, + use_initial_state=False, store_final_state=True, save_new_value=True, - block_DK=64, + block_DK=32, block_DV=32, threads=128, - num_stages=1, + num_stages=2, ) diff --git a/examples/gdn/example_chunk_o.py b/examples/gdn/example_chunk_o.py index 
1c084be70549aff839d87718d3e1a3bab688a76c..81536815923b294c38a02540f436670fc71798a4 100644 --- a/examples/gdn/example_chunk_o.py +++ b/examples/gdn/example_chunk_o.py @@ -9,6 +9,7 @@ import sys # noqa: F401 # sys.path.insert(0, "/home/tzj/flash-linear-attention") try: import fla + print(fla.__file__) from fla.ops.common.chunk_o import chunk_fwd_o except ImportError: @@ -87,16 +88,14 @@ def tilelang_chunk_fwd_o( @T.prim_func def kernel( - Q: T.Tensor(Q_shape, dtype=input_dtype), - K: T.Tensor(K_shape, dtype=input_dtype), - V: T.Tensor(V_shape, dtype=input_dtype), - HIDDEN: T.Tensor(H_shape, dtype=input_dtype), - G: T.Tensor(G_shape, dtype=gate_dtype), - O: T.Tensor(O_shape, dtype=output_dtype), + Q: T.Tensor(Q_shape, dtype=input_dtype), + K: T.Tensor(K_shape, dtype=input_dtype), + V: T.Tensor(V_shape, dtype=input_dtype), + HIDDEN: T.Tensor(H_shape, dtype=input_dtype), + G: T.Tensor(G_shape, dtype=gate_dtype), + O: T.Tensor(O_shape, dtype=output_dtype), ): - with T.Kernel( - T.ceildiv(DV, block_DV), T.ceildiv(S, block_S), B * H, - threads=threads) as (bv, bs, bbh): + with T.Kernel(T.ceildiv(DV, block_DV), T.ceildiv(S, block_S), B * H, threads=threads) as (bv, bs, bbh): bb, bh = bbh // H, bbh % H Q_shared = T.alloc_shared((block_S, block_DK), dtype=input_dtype) K_shared = T.alloc_shared((block_S, block_DK), dtype=input_dtype) @@ -109,28 +108,24 @@ def tilelang_chunk_fwd_o( G_shared = T.alloc_shared((block_S,), dtype=gate_dtype, scope="shared") G_diff_local = T.alloc_fragment((block_S, block_S), dtype=gate_dtype) - T.annotate_layout({ - Q_shared: tilelang.layout.make_swizzled_layout(Q_shared), - K_shared: tilelang.layout.make_swizzled_layout(K_shared), - V_shared: tilelang.layout.make_swizzled_layout(V_shared), - H_shared: tilelang.layout.make_swizzled_layout(H_shared), - A_shared: tilelang.layout.make_swizzled_layout(A_shared), - O_shared: tilelang.layout.make_swizzled_layout(O_shared), - }) + T.annotate_layout( + { + Q_shared: tilelang.layout.make_swizzled_layout(Q_shared), + K_shared: tilelang.layout.make_swizzled_layout(K_shared), + V_shared: tilelang.layout.make_swizzled_layout(V_shared), + H_shared: tilelang.layout.make_swizzled_layout(H_shared), + A_shared: tilelang.layout.make_swizzled_layout(A_shared), + O_shared: tilelang.layout.make_swizzled_layout(O_shared), + } + ) T.clear(A_fragment) T.clear(O_fragment) T.disable_warp_group_reg_alloc() for i_k in T.Pipelined(T.ceildiv(DK, block_DK), num_stages=num_stages): - T.copy( - Q[bb, bs * block_S:(bs + 1) * block_S, bh, i_k * block_DK:(i_k + 1) * block_DK], - Q_shared) - T.copy( - K[bb, bs * block_S:(bs + 1) * block_S, bh, i_k * block_DK:(i_k + 1) * block_DK], - K_shared) - T.copy( - HIDDEN[bb, bs, bh, i_k * block_DK:(i_k + 1) * block_DK, - bv * block_DV:(bv + 1) * block_DV], H_shared) + T.copy(Q[bb, bs * block_S : (bs + 1) * block_S, bh, i_k * block_DK : (i_k + 1) * block_DK], Q_shared) + T.copy(K[bb, bs * block_S : (bs + 1) * block_S, bh, i_k * block_DK : (i_k + 1) * block_DK], K_shared) + T.copy(HIDDEN[bb, bs, bh, i_k * block_DK : (i_k + 1) * block_DK, bv * block_DV : (bv + 1) * block_DV], H_shared) T.gemm(Q_shared, H_shared, O_fragment) T.gemm(Q_shared, K_shared, A_fragment, transpose_B=True) @@ -145,8 +140,7 @@ def tilelang_chunk_fwd_o( for i_s1, i_s2 in T.Parallel(block_S, block_S): with T.If(G_diff_local[i_s1, i_s2] <= 0): with T.Then(): - A_fragment[i_s1, i_s2] = A_fragment[i_s1, i_s2] * T.exp( - G_diff_local[i_s1, i_s2]) + A_fragment[i_s1, i_s2] = A_fragment[i_s1, i_s2] * T.exp(G_diff_local[i_s1, i_s2]) with T.Else(): 
A_fragment[i_s1, i_s2] = 0 @@ -155,8 +149,7 @@ def tilelang_chunk_fwd_o( with T.Then(): A_fragment[i_s1, i_s2] = 0 - T.copy(V[bb, bs * block_S:(bs + 1) * block_S, bh, bv * block_DV:(bv + 1) * block_DV], - V_shared) + T.copy(V[bb, bs * block_S : (bs + 1) * block_S, bh, bv * block_DV : (bv + 1) * block_DV], V_shared) T.copy(A_fragment, A_shared) T.gemm(A_shared, V_shared, O_fragment) @@ -164,8 +157,7 @@ def tilelang_chunk_fwd_o( O_fragment[i_s, i_v] = O_fragment[i_s, i_v] * scale T.copy(O_fragment, O_shared) - T.copy(O_shared, O[bb, bs * block_S:(bs + 1) * block_S, bh, - bv * block_DV:(bv + 1) * block_DV]) + T.copy(O_shared, O[bb, bs * block_S : (bs + 1) * block_S, bh, bv * block_DV : (bv + 1) * block_DV]) return kernel @@ -191,8 +183,9 @@ def run_test( output_dtype_torch = getattr(torch, output_dtype) accum_dtype_torch = getattr(torch, accum_dtype) gate_dtype_torch = getattr(torch, gate_dtype) - Q, K, V, HIDDEN, G = prepare_input(B, S, H, DK, DV, chunk_size, input_dtype_torch, - output_dtype_torch, accum_dtype_torch, gate_dtype_torch) + Q, K, V, HIDDEN, G = prepare_input( + B, S, H, DK, DV, chunk_size, input_dtype_torch, output_dtype_torch, accum_dtype_torch, gate_dtype_torch + ) scale = 1.0 / DK**0.5 O_ref = prepare_output(B, S, H, DK, DV, chunk_size, output_dtype_torch) @@ -200,9 +193,25 @@ def run_test( block_S = chunk_size O_tilelang = prepare_output(B, S, H, DK, DV, chunk_size, output_dtype_torch) - kernel = tilelang_chunk_fwd_o(B, S, H, DK, DV, input_dtype, output_dtype, accum_dtype, - gate_dtype, chunk_size, scale, use_g, block_S, block_DK, block_DV, - threads, num_stages) + kernel = tilelang_chunk_fwd_o( + B, + S, + H, + DK, + DV, + input_dtype, + output_dtype, + accum_dtype, + gate_dtype, + chunk_size, + scale, + use_g, + block_S, + block_DK, + block_DV, + threads, + num_stages, + ) O_tilelang = kernel(Q, K, V, HIDDEN, G) try: @@ -221,10 +230,10 @@ def main(): DK=128, DV=128, chunk_size=64, - input_dtype="bfloat16", - output_dtype="bfloat16", - accum_dtype="float32", - gate_dtype="float32", + input_dtype=T.bfloat16, + output_dtype=T.bfloat16, + accum_dtype=T.float32, + gate_dtype=T.float32, use_g=True, block_DK=128, block_DV=128, diff --git a/examples/gdn/example_chunk_o_bwd.py b/examples/gdn/example_chunk_o_bwd.py index 7e87a2c4f86ff5ba7ecb0ed0984a3f31d5cea4fd..97e2f4f01e08f29fcb845df5af728c7bf00b2728 100644 --- a/examples/gdn/example_chunk_o_bwd.py +++ b/examples/gdn/example_chunk_o_bwd.py @@ -12,6 +12,7 @@ from tilelang.engine.callback import register_cuda_postproc_callback # noqa: F4 # sys.path.insert(0, "/home/tzj/flash-linear-attention") try: import fla + print(fla.__file__) from fla.ops.common.chunk_o import chunk_bwd_dqkwg except ImportError: @@ -19,7 +20,7 @@ except ImportError: fla = None import torch -from utils import * +from test_utils import assert_similar torch.random.manual_seed(0) # torch.set_printoptions(profile="full") @@ -108,10 +109,8 @@ def prepare_output( @tilelang.jit( out_idx=[-4, -3, -2, -1], - pass_configs={ - tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, - tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True - }) + pass_configs={tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True}, +) def tilelang_chunk_o_bwd_dqkwg( # task config B, @@ -155,25 +154,23 @@ def tilelang_chunk_o_bwd_dqkwg( @T.prim_func def kernel( - # input - Q: T.Tensor(Q_shape, dtype=input_dtype), - K: T.Tensor(K_shape, dtype=input_dtype), - V: T.Tensor(V_shape, dtype=input_dtype), - h: T.Tensor(h_shape, dtype=input_dtype), - 
G: T.Tensor(G_shape, dtype=gate_dtype), - dO: T.Tensor(dO_shape, dtype=input_dtype), - dh: T.Tensor(dh_shape, dtype=input_dtype), - dv: T.Tensor(dv_shape, dtype=input_dtype), - W: T.Tensor(W_shape, dtype=input_dtype), - # output - dq: T.Tensor(dq_shape, dtype=output_dtype), - dk: T.Tensor(dk_shape, dtype=output_dtype), - dw: T.Tensor(dw_shape, dtype=output_dtype), - dg: T.Tensor(dg_shape, dtype=gate_dtype), + # input + Q: T.Tensor(Q_shape, dtype=input_dtype), + K: T.Tensor(K_shape, dtype=input_dtype), + V: T.Tensor(V_shape, dtype=input_dtype), + h: T.Tensor(h_shape, dtype=input_dtype), + G: T.Tensor(G_shape, dtype=gate_dtype), + dO: T.Tensor(dO_shape, dtype=input_dtype), + dh: T.Tensor(dh_shape, dtype=input_dtype), + dv: T.Tensor(dv_shape, dtype=input_dtype), + W: T.Tensor(W_shape, dtype=input_dtype), + # output + dq: T.Tensor(dq_shape, dtype=output_dtype), + dk: T.Tensor(dk_shape, dtype=output_dtype), + dw: T.Tensor(dw_shape, dtype=output_dtype), + dg: T.Tensor(dg_shape, dtype=gate_dtype), ): - with T.Kernel( - T.ceildiv(DK, block_DK), T.ceildiv(S, block_S), B * H, - threads=threads) as (bk, bs, bbh): + with T.Kernel(T.ceildiv(DK, block_DK), T.ceildiv(S, block_S), B * H, threads=threads) as (bk, bs, bbh): bb, bh = bbh // H, bbh % H V_shared = T.alloc_shared((block_S, block_DV), dtype=input_dtype) @@ -212,15 +209,17 @@ def tilelang_chunk_o_bwd_dqkwg( T.use_swizzle(10) - T.annotate_layout({ - V_shared: tilelang.layout.make_swizzled_layout(V_shared), - dO_shared: tilelang.layout.make_swizzled_layout(dO_shared), - h_shared: tilelang.layout.make_swizzled_layout(h_shared), - dh_shared: tilelang.layout.make_swizzled_layout(dh_shared), - dv_shared: tilelang.layout.make_swizzled_layout(dv_shared), - q_shared: tilelang.layout.make_swizzled_layout(q_shared), - k_shared: tilelang.layout.make_swizzled_layout(k_shared), - }) + T.annotate_layout( + { + V_shared: tilelang.layout.make_swizzled_layout(V_shared), + dO_shared: tilelang.layout.make_swizzled_layout(dO_shared), + h_shared: tilelang.layout.make_swizzled_layout(h_shared), + dh_shared: tilelang.layout.make_swizzled_layout(dh_shared), + dv_shared: tilelang.layout.make_swizzled_layout(dv_shared), + q_shared: tilelang.layout.make_swizzled_layout(q_shared), + k_shared: tilelang.layout.make_swizzled_layout(k_shared), + } + ) T.clear(dg_last_local) T.clear(G_last_local) @@ -235,18 +234,10 @@ def tilelang_chunk_o_bwd_dqkwg( T.clear(dw_fragment) for i_v in T.Pipelined(T.ceildiv(DV, block_DV), num_stages=num_stages): - T.copy( - V[bb, bs * block_S:(bs + 1) * block_S, bh, i_v * block_DV:(i_v + 1) * block_DV], - V_shared) - T.copy( - dO[bb, bs * block_S:(bs + 1) * block_S, bh, - i_v * block_DV:(i_v + 1) * block_DV], dO_shared) - T.copy( - h[bb, bs, bh, bk * block_DK:(bk + 1) * block_DK, - i_v * block_DV:(i_v + 1) * block_DV], h_shared) - T.copy( - dh[bb, bs, bh, bk * block_DK:(bk + 1) * block_DK, - i_v * block_DV:(i_v + 1) * block_DV], dh_shared) + T.copy(V[bb, bs * block_S : (bs + 1) * block_S, bh, i_v * block_DV : (i_v + 1) * block_DV], V_shared) + T.copy(dO[bb, bs * block_S : (bs + 1) * block_S, bh, i_v * block_DV : (i_v + 1) * block_DV], dO_shared) + T.copy(h[bb, bs, bh, bk * block_DK : (bk + 1) * block_DK, i_v * block_DV : (i_v + 1) * block_DV], h_shared) + T.copy(dh[bb, bs, bh, bk * block_DK : (bk + 1) * block_DK, i_v * block_DV : (i_v + 1) * block_DV], dh_shared) if use_g: T.clear(dg_last_fragment_scalar) @@ -254,9 +245,7 @@ def tilelang_chunk_o_bwd_dqkwg( # for i_kv in T.Parallel(block_DK * block_DV): # dg_last_fragment[i_kv] = h_shared[i_kv // 
block_DV, i_kv % block_DV] * dh_shared[i_kv // block_DV, i_kv % block_DV] for i_kv in T.Parallel(block_DK * block_DV): - dg_last_fragment[i_kv] = h_shared[i_kv // block_DV, i_kv % - block_DV] * dh_shared[i_kv // block_DV, - i_kv % block_DV] + dg_last_fragment[i_kv] = h_shared[i_kv // block_DV, i_kv % block_DV] * dh_shared[i_kv // block_DV, i_kv % block_DV] T.reduce_sum(dg_last_fragment, dg_last_fragment_scalar, dim=-1, clear=False) dg_last_local[0] += dg_last_fragment_scalar[0] @@ -265,22 +254,16 @@ def tilelang_chunk_o_bwd_dqkwg( T.gemm(V_shared, dh_shared, dk_fragment, transpose_B=True) if use_dw: - T.copy( - dv[bb, bs * block_S:(bs + 1) * block_S, bh, - i_v * block_DV:(i_v + 1) * block_DV], dv_shared) + T.copy(dv[bb, bs * block_S : (bs + 1) * block_S, bh, i_v * block_DV : (i_v + 1) * block_DV], dv_shared) T.gemm(dv_shared, h_shared, dw_fragment, transpose_B=True) if use_dw: for i_s, i_k in T.Parallel(block_S, block_DK): dw_fragment[i_s, i_k] = -dw_fragment[i_s, i_k] - T.copy( - dw_fragment, dw[bb, bs * block_S:(bs + 1) * block_S, bh, - bk * block_DK:(bk + 1) * block_DK]) - - T.copy(Q[bb, bs * block_S:(bs + 1) * block_S, bh, bk * block_DK:(bk + 1) * block_DK], - q_shared) - T.copy(K[bb, bs * block_S:(bs + 1) * block_S, bh, bk * block_DK:(bk + 1) * block_DK], - k_shared) + T.copy(dw_fragment, dw[bb, bs * block_S : (bs + 1) * block_S, bh, bk * block_DK : (bk + 1) * block_DK]) + + T.copy(Q[bb, bs * block_S : (bs + 1) * block_S, bh, bk * block_DK : (bk + 1) * block_DK], q_shared) + T.copy(K[bb, bs * block_S : (bs + 1) * block_S, bh, bk * block_DK : (bk + 1) * block_DK], k_shared) T.copy(q_shared, q_fragment) T.copy(k_shared, k_fragment) @@ -294,8 +277,7 @@ def tilelang_chunk_o_bwd_dqkwg( dg_last_local[0] = dg_last_local[0] * T.exp(G[bb, bs * block_S + block_S - 1, bh]) for i_s, i_k in T.Parallel(block_S, block_DK): - dq_fragment[i_s, i_k] = dq_fragment[i_s, i_k] * T.exp(G[bb, bs * block_S + i_s, - bh]) * scale + dq_fragment[i_s, i_k] = dq_fragment[i_s, i_k] * T.exp(G[bb, bs * block_S + i_s, bh]) * scale T.clear(dg_fragment_reduce_tmp) for i_s, i_k in T.Parallel(block_S, block_DK): dg_fragment_reduce_tmp[i_s, i_k] = dq_fragment[i_s, i_k] * q_shared[i_s, i_k] @@ -305,8 +287,7 @@ def tilelang_chunk_o_bwd_dqkwg( for i_s, i_k in T.Parallel(block_S, block_DK): with T.If(G_last_local[0] - G[bb, bs * block_S + i_s, bh] <= 0): with T.Then(): - dk_fragment[i_s, i_k] = dk_fragment[i_s, i_k] * T.exp( - G_last_local[0] - G[bb, bs * block_S + i_s, bh]) + dk_fragment[i_s, i_k] = dk_fragment[i_s, i_k] * T.exp(G_last_local[0] - G[bb, bs * block_S + i_s, bh]) with T.Else(): dk_fragment[i_s, i_k] = 0 T.clear(dg_fragment_reduce_tmp) @@ -325,12 +306,11 @@ def tilelang_chunk_o_bwd_dqkwg( dg_last_local[1] = dg_last_fragment_scalar_2[0] for i_s1, i_s2 in T.Parallel(block_S, block_S): - with T.If(i_s1 >= i_s2 and - G[bb, bs * block_S + i_s1, bh] - G[bb, bs * block_S + i_s2, bh] <= 0): + with T.If(i_s1 >= i_s2 and G[bb, bs * block_S + i_s1, bh] - G[bb, bs * block_S + i_s2, bh] <= 0): with T.Then(): - ds_fragment[i_s1, i_s2] = ds_fragment[ - i_s1, i_s2] * T.exp(G[bb, bs * block_S + i_s1, bh] - - G[bb, bs * block_S + i_s2, bh]) * scale + ds_fragment[i_s1, i_s2] = ( + ds_fragment[i_s1, i_s2] * T.exp(G[bb, bs * block_S + i_s1, bh] - G[bb, bs * block_S + i_s2, bh]) * scale + ) with T.Else(): ds_fragment[i_s1, i_s2] = 0 @@ -338,8 +318,7 @@ def tilelang_chunk_o_bwd_dqkwg( T.clear(ds_fragment_positive_transpose) T.gemm(q_shared, k_shared, ds_fragment_positive, transpose_B=True) for i_s1, i_s2 in T.Parallel(block_S, block_S): 
- ds_fragment_positive[ - i_s1, i_s2] = ds_fragment[i_s1, i_s2] * ds_fragment_positive[i_s1, i_s2] + ds_fragment_positive[i_s1, i_s2] = ds_fragment[i_s1, i_s2] * ds_fragment_positive[i_s1, i_s2] # FIXME: The reduce_sum statement with clear=True will cause an error of warp specialized pass T.reduce_sum(ds_fragment_positive, dg_fragment, dim=1, clear=False) @@ -363,15 +342,10 @@ def tilelang_chunk_o_bwd_dqkwg( for i_s in T.Parallel(block_S): with T.If(i_s >= block_S - 1): # noqa: SIM117 with T.Then(): - dg_fragment_final[ - i_s] = dg_fragment_final[i_s] + dg_last_local[0] + dg_last_local[1] - - T.copy( - dq_fragment, dq[bb, bs * block_S:(bs + 1) * block_S, bh, - bk * block_DK:(bk + 1) * block_DK]) - T.copy( - dk_fragment, dk[bb, bs * block_S:(bs + 1) * block_S, bh, - bk * block_DK:(bk + 1) * block_DK]) + dg_fragment_final[i_s] = dg_fragment_final[i_s] + dg_last_local[0] + dg_last_local[1] + + T.copy(dq_fragment, dq[bb, bs * block_S : (bs + 1) * block_S, bh, bk * block_DK : (bk + 1) * block_DK]) + T.copy(dk_fragment, dk[bb, bs * block_S : (bs + 1) * block_S, bh, bk * block_DK : (bk + 1) * block_DK]) for i_s in T.Parallel(block_S): dg[bk, bb, bs * block_S + i_s, bh] = dg_fragment_final[i_s] @@ -387,12 +361,8 @@ def tilelang_chunk_o_bwd_dqkwg( for i_s, i_k in T.Parallel(block_S, block_DK): dq_fragment[i_s, i_k] = dq_fragment[i_s, i_k] * scale dk_fragment[i_s, i_k] = dk_fragment[i_s, i_k] + dk_fragment_2[i_s, i_k] * scale - T.copy( - dq_fragment, dq[bb, bs * block_S:(bs + 1) * block_S, bh, - bk * block_DK:(bk + 1) * block_DK]) - T.copy( - dk_fragment, dk[bb, bs * block_S:(bs + 1) * block_S, bh, - bk * block_DK:(bk + 1) * block_DK]) + T.copy(dq_fragment, dq[bb, bs * block_S : (bs + 1) * block_S, bh, bk * block_DK : (bk + 1) * block_DK]) + T.copy(dk_fragment, dk[bb, bs * block_S : (bs + 1) * block_S, bh, bk * block_DK : (bk + 1) * block_DK]) return kernel @@ -442,33 +412,53 @@ def run_test( threads=256, num_stages=0, ): - Q, K, V, h, G, dO, dh, dv, W = prepare_input(B, S, H, DK, DV, chunk_size, - getattr(torch, input_dtype), - getattr(torch, output_dtype), - getattr(torch, accum_dtype), - getattr(torch, gate_dtype), - getattr(torch, state_dtype)) - dq_ref, dk_ref, dw_ref, dg_ref = prepare_output(B, S, H, DK, DV, chunk_size, - getattr(torch, output_dtype), - getattr(torch, gate_dtype), - getattr(torch, state_dtype), block_DK) + Q, K, V, h, G, dO, dh, dv, W = prepare_input( + B, + S, + H, + DK, + DV, + chunk_size, + getattr(torch, input_dtype), + getattr(torch, output_dtype), + getattr(torch, accum_dtype), + getattr(torch, gate_dtype), + getattr(torch, state_dtype), + ) + dq_ref, dk_ref, dw_ref, dg_ref = prepare_output( + B, S, H, DK, DV, chunk_size, getattr(torch, output_dtype), getattr(torch, gate_dtype), getattr(torch, state_dtype), block_DK + ) dq_tilelang, dk_tilelang, dw_tilelang, dg_tilelang = prepare_output( - B, S, H, DK, DV, chunk_size, getattr(torch, output_dtype), getattr(torch, gate_dtype), - getattr(torch, state_dtype), block_DK) + B, S, H, DK, DV, chunk_size, getattr(torch, output_dtype), getattr(torch, gate_dtype), getattr(torch, state_dtype), block_DK + ) # ref if use_g: - dq_ref, dk_ref, dw_ref, dg_ref = chunk_bwd_dqkwg( - Q, K, V, G, dO, h, dh, dv, W, chunk_size=chunk_size, scale=scale) + dq_ref, dk_ref, dw_ref, dg_ref = chunk_bwd_dqkwg(Q, K, V, G, dO, h, dh, dv, W, chunk_size=chunk_size, scale=scale) else: - dq_ref, dk_ref, dw_ref, dg_ref = chunk_bwd_dqkwg( - Q, K, V, None, dO, h, dh, dv, W, chunk_size=chunk_size, scale=scale) + dq_ref, dk_ref, dw_ref, dg_ref = chunk_bwd_dqkwg(Q, 
K, V, None, dO, h, dh, dv, W, chunk_size=chunk_size, scale=scale) # tilelang - kernel = tilelang_chunk_o_bwd_dqkwg(B, S, H, DK, DV, input_dtype, output_dtype, accum_dtype, - gate_dtype, state_dtype, chunk_size, scale, use_g, use_dw, - block_DK, block_DV, threads, num_stages) - print(kernel.get_kernel_source()) + kernel = tilelang_chunk_o_bwd_dqkwg( + B, + S, + H, + DK, + DV, + input_dtype, + output_dtype, + accum_dtype, + gate_dtype, + state_dtype, + chunk_size, + scale, + use_g, + use_dw, + block_DK, + block_DV, + threads, + num_stages, + ) dq_tilelang, dk_tilelang, dw_tilelang, dg_tilelang = kernel(Q, K, V, h, G, dO, dh, dv, W) if use_g: @@ -515,11 +505,11 @@ def main(): H=8, DK=DK, DV=DV, - input_dtype="bfloat16", - output_dtype="bfloat16", - accum_dtype="float32", - gate_dtype="float32", - state_dtype="float32", + input_dtype=T.bfloat16, + output_dtype=T.bfloat16, + accum_dtype=T.float32, + gate_dtype=T.float32, + state_dtype=T.float32, chunk_size=64, scale=DK**-0.5, # scale=1, diff --git a/examples/gdn/example_chunk_scaled_dot_kkt.py b/examples/gdn/example_chunk_scaled_dot_kkt.py index d07a4776a23f4808547f99ce4b6616cf44b93bc5..e8ef17e3f4b50479958692460d4fc785b0e82a18 100644 --- a/examples/gdn/example_chunk_scaled_dot_kkt.py +++ b/examples/gdn/example_chunk_scaled_dot_kkt.py @@ -9,6 +9,7 @@ import sys # noqa: F401 # sys.path.insert(0, "/home/tzj/flash-linear-attention") try: import fla + print(fla.__file__) from fla.ops.common.chunk_scaled_dot_kkt import chunk_scaled_dot_kkt_fwd except ImportError: @@ -56,9 +57,9 @@ def tilelang_chunk_scaled_dot_kkt_fwd( H, DK, chunk_size=64, - input_dtype="bfloat16", - output_dtype="bfloat16", - accum_dtype="float32", + input_dtype=T.bfloat16, + output_dtype=T.bfloat16, + accum_dtype=T.float32, use_g=True, # kernel config block_S=64, @@ -75,10 +76,10 @@ def tilelang_chunk_scaled_dot_kkt_fwd( @T.prim_func def kernel( - K: T.Tensor(K_shape, dtype=input_dtype), - Beta: T.Tensor(Beta_shape, dtype=input_dtype), - G: T.Tensor(G_shape, dtype=accum_dtype), - A: T.Tensor(output_shape, dtype=output_dtype), + K: T.Tensor(K_shape, dtype=input_dtype), + Beta: T.Tensor(Beta_shape, dtype=input_dtype), + G: T.Tensor(G_shape, dtype=accum_dtype), + A: T.Tensor(output_shape, dtype=output_dtype), ): with T.Kernel(T.ceildiv(S, block_S), B * H, threads=threads) as (bs, bbh): bb, bh = bbh // H, bbh % H @@ -93,10 +94,12 @@ def tilelang_chunk_scaled_dot_kkt_fwd( G_shared = T.alloc_shared((block_S,), dtype=accum_dtype, scope="shared") G_diff_local = T.alloc_fragment((block_S, block_S), dtype=accum_dtype) - T.annotate_layout({ - K_shared: tilelang.layout.make_swizzled_layout(K_shared), - A_shared: tilelang.layout.make_swizzled_layout(A_shared), - }) + T.annotate_layout( + { + K_shared: tilelang.layout.make_swizzled_layout(K_shared), + A_shared: tilelang.layout.make_swizzled_layout(A_shared), + } + ) T.fill(A_fragment, 0) T.disable_warp_group_reg_alloc() @@ -104,9 +107,7 @@ def tilelang_chunk_scaled_dot_kkt_fwd( Beta_shared[i_s] = Beta[bb, bs * block_S + i_s, bh] for i_k in T.Pipelined(T.ceildiv(DK, block_DK), num_stages=num_stages): - T.copy( - K[bb, bs * block_S:(bs + 1) * block_S, bh, i_k * block_DK:(i_k + 1) * block_DK], - K_shared) + T.copy(K[bb, bs * block_S : (bs + 1) * block_S, bh, i_k * block_DK : (i_k + 1) * block_DK], K_shared) for i_s, i_k2 in T.Parallel(block_S, block_DK): Beta_K_fragment[i_s, i_k2] = K_shared[i_s, i_k2] * Beta_shared[i_s] T.gemm(Beta_K_fragment, K_shared, A_fragment, transpose_B=True) @@ -119,8 +120,7 @@ def tilelang_chunk_scaled_dot_kkt_fwd( for 
i_s1, i_s2 in T.Parallel(block_S, block_S): with T.If(G_diff_local[i_s1, i_s2] <= 0 and i_s1 > i_s2): with T.Then(): - A_fragment[i_s1, i_s2] = A_fragment[i_s1, i_s2] * T.exp( - G_diff_local[i_s1, i_s2]) + A_fragment[i_s1, i_s2] = A_fragment[i_s1, i_s2] * T.exp(G_diff_local[i_s1, i_s2]) with T.Else(): A_fragment[i_s1, i_s2] = 0 else: @@ -130,7 +130,7 @@ def tilelang_chunk_scaled_dot_kkt_fwd( A_fragment[i_s1, i_s2] = 0 T.copy(A_fragment, A_shared) - T.copy(A_shared, A[bb, bs * block_S:(bs + 1) * block_S, bh, :]) + T.copy(A_shared, A[bb, bs * block_S : (bs + 1) * block_S, bh, :]) return kernel @@ -149,24 +149,21 @@ def run_test( threads, num_stages, ): - K, Beta, G = prepare_input(B, S, H, DK, getattr(torch, input_dtype), - getattr(torch, output_dtype), getattr(torch, accum_dtype)) + K, Beta, G = prepare_input(B, S, H, DK, getattr(torch, input_dtype), getattr(torch, output_dtype), getattr(torch, accum_dtype)) A_ref = prepare_output(B, S, H, chunk_size, getattr(torch, output_dtype)) A_tilelang = prepare_output(B, S, H, chunk_size, getattr(torch, output_dtype)) # reference if use_g: - A_ref = chunk_scaled_dot_kkt_fwd( - K, Beta, G, chunk_size=chunk_size, output_dtype=getattr(torch, output_dtype)) + A_ref = chunk_scaled_dot_kkt_fwd(K, Beta, G, chunk_size=chunk_size, output_dtype=getattr(torch, output_dtype)) else: - A_ref = chunk_scaled_dot_kkt_fwd( - K, Beta, None, chunk_size=chunk_size, output_dtype=getattr(torch, output_dtype)) + A_ref = chunk_scaled_dot_kkt_fwd(K, Beta, None, chunk_size=chunk_size, output_dtype=getattr(torch, output_dtype)) # tilelang block_S = chunk_size - kernel = tilelang_chunk_scaled_dot_kkt_fwd(B, S, H, DK, chunk_size, input_dtype, output_dtype, - accum_dtype, use_g, block_S, block_DK, threads, - num_stages) + kernel = tilelang_chunk_scaled_dot_kkt_fwd( + B, S, H, DK, chunk_size, input_dtype, output_dtype, accum_dtype, use_g, block_S, block_DK, threads, num_stages + ) A_tilelang = kernel(K, Beta, G) try: @@ -186,13 +183,14 @@ def main(): H=32, DK=128, chunk_size=64, - input_dtype="bfloat16", - output_dtype="bfloat16", - accum_dtype="float32", + input_dtype=T.bfloat16, + output_dtype=T.bfloat16, + accum_dtype=T.float32, use_g=True, block_DK=64, threads=128, - num_stages=2) + num_stages=2, + ) if __name__ == "__main__": diff --git a/examples/gdn/example_cumsum.py b/examples/gdn/example_cumsum.py index 9896c7ecf7689bf379e371298f06822502eb34d0..0760b496458ae334d1baf54934c0526962a529a6 100644 --- a/examples/gdn/example_cumsum.py +++ b/examples/gdn/example_cumsum.py @@ -10,6 +10,7 @@ import sys # noqa: F401 # sys.path.insert(0, "/home/tzj/flash-linear-attention") try: import fla + print(fla.__file__) from fla.ops.utils.cumsum import chunk_local_cumsum_scalar except ImportError: @@ -20,11 +21,8 @@ import torch @tilelang.jit( - out_idx=[-1], - pass_configs={ - tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, - tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True - }) + out_idx=[-1], pass_configs={tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True} +) def tilelang_chunk_local_cumsum_scalar( # task config B, @@ -34,43 +32,43 @@ def tilelang_chunk_local_cumsum_scalar( is_varlen=False, head_first=False, reverse=False, - input_dtype="float16", - output_dtype="float32", + input_dtype=T.float16, + output_dtype=T.float32, # kernel config block_S=64, threads=256, use_fragment=False, ): G_shape = (B, H, S) if head_first else (B, S, H) - assert chunk_size == 2**(chunk_size.bit_length() - 1), "chunk_size must be a power of 2" 
+ assert chunk_size == 2 ** (chunk_size.bit_length() - 1), "chunk_size must be a power of 2" assert chunk_size == block_S, "chunk_size must be equal to block_S" @T.prim_func def kernel( - G: T.Tensor(G_shape, dtype=input_dtype), - G_new: T.Tensor(G_shape, dtype=output_dtype), + G: T.Tensor(G_shape, dtype=input_dtype), + G_new: T.Tensor(G_shape, dtype=output_dtype), ): with T.Kernel(T.ceildiv(S, block_S), B * H, threads=threads) as (bs, bbh): bb, bh = bbh // H, bbh % H G_shared = T.alloc_shared((1, block_S), dtype=output_dtype, scope="shared") if head_first: - T.copy(G[bb, bh, bs * block_S:(bs + 1) * block_S], G_shared) + T.copy(G[bb, bh, bs * block_S : (bs + 1) * block_S], G_shared) else: - T.copy(G[bb, bs * block_S:(bs + 1) * block_S, bh], G_shared) + T.copy(G[bb, bs * block_S : (bs + 1) * block_S, bh], G_shared) if use_fragment: G_fragment = T.alloc_fragment((1, block_S), dtype=output_dtype, scope="shared") T.copy(G_shared, G_fragment) T.cumsum(G_fragment, dim=1, reverse=reverse) if head_first: - T.copy(G_fragment, G_new[bb, bh, bs * block_S:(bs + 1) * block_S]) + T.copy(G_fragment, G_new[bb, bh, bs * block_S : (bs + 1) * block_S]) else: - T.copy(G_fragment, G_new[bb, bs * block_S:(bs + 1) * block_S, bh]) + T.copy(G_fragment, G_new[bb, bs * block_S : (bs + 1) * block_S, bh]) else: T.cumsum(G_shared, dim=1, reverse=reverse) if head_first: - T.copy(G_shared, G_new[bb, bh, bs * block_S:(bs + 1) * block_S]) + T.copy(G_shared, G_new[bb, bh, bs * block_S : (bs + 1) * block_S]) else: - T.copy(G_shared, G_new[bb, bs * block_S:(bs + 1) * block_S, bh]) + T.copy(G_shared, G_new[bb, bs * block_S : (bs + 1) * block_S, bh]) return kernel @@ -113,11 +111,8 @@ def run_test( # reference cumsum G_new_ref = chunk_local_cumsum_scalar( - g=G, - chunk_size=chunk_size, - reverse=reverse, - head_first=head_first, - output_dtype=getattr(torch, output_dtype)) + g=G, chunk_size=chunk_size, reverse=reverse, head_first=head_first, output_dtype=getattr(torch, output_dtype) + ) # tilelang cumsum block_S = chunk_size @@ -159,10 +154,11 @@ def main(): chunk_size=64, reverse=True, head_first=False, - input_dtype="float32", - output_dtype="float32", + input_dtype=T.float32, + output_dtype=T.float32, threads=256, - use_fragment=False) + use_fragment=False, + ) if __name__ == "__main__": diff --git a/examples/gdn/example_wy_fast.py b/examples/gdn/example_wy_fast.py index 0a0983a82f70f10994e1a150b484e271e5c7915a..9ac086ca76916afddbb671377b97809546528e40 100644 --- a/examples/gdn/example_wy_fast.py +++ b/examples/gdn/example_wy_fast.py @@ -9,6 +9,7 @@ import sys # noqa: F401 # sys.path.insert(0, "/home/tzj/flash-linear-attention") try: import fla + print(fla.__file__) from fla.ops.gated_delta_rule.wy_fast import recompute_w_u_fwd except ImportError: @@ -73,13 +74,13 @@ def tilelang_recompute_w_u_fwd( @T.prim_func def kernel( - K: T.Tensor(K_shape, dtype=input_dtype), - V: T.Tensor(V_shape, dtype=input_dtype), - Beta: T.Tensor(Beta_shape, dtype=input_dtype), - G: T.Tensor(G_shape, dtype=gate_dtype), - A: T.Tensor(A_shape, dtype=output_dtype), - W: T.Tensor(K_shape, dtype=output_dtype), - U: T.Tensor(V_shape, dtype=output_dtype), + K: T.Tensor(K_shape, dtype=input_dtype), + V: T.Tensor(V_shape, dtype=input_dtype), + Beta: T.Tensor(Beta_shape, dtype=input_dtype), + G: T.Tensor(G_shape, dtype=gate_dtype), + A: T.Tensor(A_shape, dtype=output_dtype), + W: T.Tensor(K_shape, dtype=output_dtype), + U: T.Tensor(V_shape, dtype=output_dtype), ): with T.Kernel(T.ceildiv(S, block_S), B * H, threads=threads) as (bs, bbh): bb, bh = bbh // H, 
bbh % H @@ -95,49 +96,42 @@ def tilelang_recompute_w_u_fwd( W_Beta_shared = T.alloc_shared((block_S, block_DK), dtype=input_dtype) U_Beta_shared = T.alloc_shared((block_S, block_DV), dtype=input_dtype) - T.annotate_layout({ - K_shared: tilelang.layout.make_swizzled_layout(K_shared), - V_shared: tilelang.layout.make_swizzled_layout(V_shared), - A_shared: tilelang.layout.make_swizzled_layout(A_shared), - W_shared: tilelang.layout.make_swizzled_layout(W_shared), - U_shared: tilelang.layout.make_swizzled_layout(U_shared), - W_Beta_shared: tilelang.layout.make_swizzled_layout(W_Beta_shared), - U_Beta_shared: tilelang.layout.make_swizzled_layout(U_Beta_shared), - }) + T.annotate_layout( + { + K_shared: tilelang.layout.make_swizzled_layout(K_shared), + V_shared: tilelang.layout.make_swizzled_layout(V_shared), + A_shared: tilelang.layout.make_swizzled_layout(A_shared), + W_shared: tilelang.layout.make_swizzled_layout(W_shared), + U_shared: tilelang.layout.make_swizzled_layout(U_shared), + W_Beta_shared: tilelang.layout.make_swizzled_layout(W_Beta_shared), + U_Beta_shared: tilelang.layout.make_swizzled_layout(U_Beta_shared), + } + ) T.disable_warp_group_reg_alloc() for i_s in T.Parallel(block_S): Beta_shared[i_s] = Beta[bb, bs * block_S + i_s, bh] G_shared[i_s] = T.exp(G[bb, bs * block_S + i_s, bh]) - T.copy(A[bb, bs * block_S:(bs + 1) * block_S, bh, :], A_shared) + T.copy(A[bb, bs * block_S : (bs + 1) * block_S, bh, :], A_shared) for i_v in T.Pipelined(T.ceildiv(DV, block_DV), num_stages=num_stages): - T.copy( - V[bb, bs * block_S:(bs + 1) * block_S, bh, i_v * block_DV:(i_v + 1) * block_DV], - V_shared) + T.copy(V[bb, bs * block_S : (bs + 1) * block_S, bh, i_v * block_DV : (i_v + 1) * block_DV], V_shared) for i_s, i_v2 in T.Parallel(block_S, block_DV): U_Beta_shared[i_s, i_v2] = V_shared[i_s, i_v2] * Beta_shared[i_s] T.gemm(A_shared, U_Beta_shared, U_fragment, clear_accum=True) # First copy to smem, then copy to gmem to reduce U2RU instructions T.copy(U_fragment, U_shared) - T.copy( - U_shared, U[bb, bs * block_S:(bs + 1) * block_S, bh, - i_v * block_DV:(i_v + 1) * block_DV]) + T.copy(U_shared, U[bb, bs * block_S : (bs + 1) * block_S, bh, i_v * block_DV : (i_v + 1) * block_DV]) for i_k in T.Pipelined(T.ceildiv(DK, block_DK), num_stages=num_stages): - T.copy( - K[bb, bs * block_S:(bs + 1) * block_S, bh, i_k * block_DK:(i_k + 1) * block_DK], - K_shared) + T.copy(K[bb, bs * block_S : (bs + 1) * block_S, bh, i_k * block_DK : (i_k + 1) * block_DK], K_shared) for i_s, i_k2 in T.Parallel(block_S, block_DK): - W_Beta_shared[i_s, - i_k2] = K_shared[i_s, i_k2] * Beta_shared[i_s] * G_shared[i_s] + W_Beta_shared[i_s, i_k2] = K_shared[i_s, i_k2] * Beta_shared[i_s] * G_shared[i_s] T.gemm(A_shared, W_Beta_shared, W_fragment, clear_accum=True) # First copy to smem, then copy to gmem to reduce U2RU instructions T.copy(W_fragment, W_shared) - T.copy( - W_shared, W[bb, bs * block_S:(bs + 1) * block_S, bh, - i_k * block_DK:(i_k + 1) * block_DK]) + T.copy(W_shared, W[bb, bs * block_S : (bs + 1) * block_S, bh, i_k * block_DK : (i_k + 1) * block_DK]) return kernel @@ -159,15 +153,8 @@ def run_test( num_stages, ): K, V, Beta, G, A = prepare_input( - B, - S, - H, - DK, - DV, - chunk_size, - getattr(torch, input_dtype), - getattr(torch, output_dtype), - gate_dtype=getattr(torch, gate_dtype)) + B, S, H, DK, DV, chunk_size, getattr(torch, input_dtype), getattr(torch, output_dtype), gate_dtype=getattr(torch, gate_dtype) + ) W_ref, U_ref = prepare_output(B, S, H, DK, DV, getattr(torch, output_dtype)) W_tilelang, U_tilelang = 
prepare_output(B, S, H, DK, DV, getattr(torch, output_dtype)) @@ -191,7 +178,8 @@ def run_test( block_DK=block_DK, block_DV=block_DV, threads=threads, - num_stages=num_stages) + num_stages=num_stages, + ) print(kernel.get_kernel_source()) W_tilelang, U_tilelang = kernel(K, V, Beta, G, A) @@ -217,14 +205,15 @@ def main(): DK=128, DV=128, chunk_size=64, - input_dtype="bfloat16", - output_dtype="bfloat16", - gate_dtype="float32", - accum_dtype="float32", + input_dtype=T.bfloat16, + output_dtype=T.bfloat16, + gate_dtype=T.float32, + accum_dtype=T.float32, block_DK=64, block_DV=32, threads=128, - num_stages=3) + num_stages=3, + ) if __name__ == "__main__": diff --git a/examples/gdn/example_wy_fast_bwd_split.py b/examples/gdn/example_wy_fast_bwd_split.py index 618a82b4c8288421b4dd91570c3edf9ac515d2de..de8afc2b7770432db2535c05c69a852b1af802da 100644 --- a/examples/gdn/example_wy_fast_bwd_split.py +++ b/examples/gdn/example_wy_fast_bwd_split.py @@ -10,6 +10,7 @@ import tilelang.language as T # sys.path.insert(0, "/home/tzj/flash-linear-attention") try: import fla + print(fla.__file__) from fla.ops.gated_delta_rule.wy_fast import bwd_prepare_wy_repr except ImportError: @@ -93,10 +94,8 @@ def prepare_output( @tilelang.jit( out_idx=[-5, -4, -3, -2, -1], - pass_configs={ - tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, - tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True - }) + pass_configs={tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True}, +) def tilelang_wy_fast_bwd( # task config B, @@ -135,20 +134,20 @@ def tilelang_wy_fast_bwd( @T.prim_func def kernel( - # input - K: T.Tensor(K_shape, dtype=input_dtype), - V: T.Tensor(V_shape, dtype=input_dtype), - Beta: T.Tensor(Beta_shape, dtype=input_dtype), - G: T.Tensor(G_shape, dtype=gate_dtype), - A: T.Tensor(A_shape, dtype=input_dtype), - dw: T.Tensor(dw_shape, dtype=input_dtype), - du: T.Tensor(du_shape, dtype=input_dtype), - # output - dA: T.Tensor(dA_shape, dtype=input_dtype), - dk: T.Tensor(dk_shape, dtype=output_dtype), - dv: T.Tensor(dv_shape, dtype=output_dtype), - dbeta: T.Tensor(dbeta_shape, dtype=output_dtype), - dg: T.Tensor(dg_shape, dtype=gate_dtype), + # input + K: T.Tensor(K_shape, dtype=input_dtype), + V: T.Tensor(V_shape, dtype=input_dtype), + Beta: T.Tensor(Beta_shape, dtype=input_dtype), + G: T.Tensor(G_shape, dtype=gate_dtype), + A: T.Tensor(A_shape, dtype=input_dtype), + dw: T.Tensor(dw_shape, dtype=input_dtype), + du: T.Tensor(du_shape, dtype=input_dtype), + # output + dA: T.Tensor(dA_shape, dtype=input_dtype), + dk: T.Tensor(dk_shape, dtype=output_dtype), + dv: T.Tensor(dv_shape, dtype=output_dtype), + dbeta: T.Tensor(dbeta_shape, dtype=output_dtype), + dg: T.Tensor(dg_shape, dtype=gate_dtype), ): with T.Kernel(T.ceildiv(S, block_S), B * H, threads=threads) as (bs, bbh): bb, bh = bbh // H, bbh % H @@ -187,7 +186,7 @@ def tilelang_wy_fast_bwd( T.clear(dbeta_fragment_v) T.clear(dg_fragment) - T.copy(A[bb, bs * block_S:(bs + 1) * block_S, bh, :], A_shared) + T.copy(A[bb, bs * block_S : (bs + 1) * block_S, bh, :], A_shared) for i_s in T.Parallel(block_S): Beta_shared[i_s] = Beta[bb, bs * block_S + i_s, bh] G_shared[i_s] = G[bb, bs * block_S + i_s, bh] @@ -195,51 +194,37 @@ def tilelang_wy_fast_bwd( # Update dk for i_k in T.Pipelined(T.ceildiv(DK, block_DK), num_stages=num_stages): - T.copy( - K[bb, bs * block_S:(bs + 1) * block_S, bh, i_k * block_DK:(i_k + 1) * block_DK], - K_shared) + T.copy(K[bb, bs * block_S : (bs + 1) * block_S, bh, i_k * block_DK : (i_k + 1) * 
block_DK], K_shared) for i_s, i_k2 in T.Parallel(block_S, block_DK): - K_shared_beta_g[i_s, - i_k2] = K_shared[i_s, - i_k2] * Beta_shared[i_s] * G_shared_exp[i_s] - T.copy( - dw[bb, bs * block_S:(bs + 1) * block_S, bh, - i_k * block_DK:(i_k + 1) * block_DK], dw_shared) + K_shared_beta_g[i_s, i_k2] = K_shared[i_s, i_k2] * Beta_shared[i_s] * G_shared_exp[i_s] + T.copy(dw[bb, bs * block_S : (bs + 1) * block_S, bh, i_k * block_DK : (i_k + 1) * block_DK], dw_shared) T.gemm(dw_shared, K_shared_beta_g, dA_fragment, transpose_B=True) T.gemm(A_shared, dw_shared, dk_fragment_beta_g, clear_accum=True, transpose_A=True) for i_s, i_k2 in T.Parallel(block_S, block_DK): - dk_fragment[ - i_s, - i_k2] = dk_fragment_beta_g[i_s, i_k2] * Beta_shared[i_s] * G_shared_exp[i_s] + dk_fragment[i_s, i_k2] = dk_fragment_beta_g[i_s, i_k2] * Beta_shared[i_s] * G_shared_exp[i_s] # for i_s, i_k2 in T.Parallel(block_S, block_DK): # dbeta_fragment[i_s] = dbeta_fragment[i_s] + dk_fragment_beta_g[i_s, i_k2] * K_shared[i_s, i_k2] * G_shared_exp[i_s] for i_s, i_k2 in T.Parallel(block_S, block_DK): - dbeta_fragment_reduce_tmpk[i_s, i_k2] = dk_fragment_beta_g[ - i_s, i_k2] * K_shared[i_s, i_k2] * G_shared_exp[i_s] + dbeta_fragment_reduce_tmpk[i_s, i_k2] = dk_fragment_beta_g[i_s, i_k2] * K_shared[i_s, i_k2] * G_shared_exp[i_s] T.reduce_sum(dbeta_fragment_reduce_tmpk, dbeta_fragment_k, dim=1, clear=False) # for i_s, i_k2 in T.Parallel(block_S, block_DK): # dg_fragment[i_s] = dg_fragment[i_s] + dk_fragment_beta_g[i_s, i_k2] * K_shared[i_s, i_k2] * G_shared_exp[i_s] * Beta_shared[i_s] for i_s, i_k2 in T.Parallel(block_S, block_DK): - dg_fragment_reduce_tmp[i_s, i_k2] = dk_fragment_beta_g[i_s, i_k2] * K_shared[ - i_s, i_k2] * G_shared_exp[i_s] * Beta_shared[i_s] + dg_fragment_reduce_tmp[i_s, i_k2] = ( + dk_fragment_beta_g[i_s, i_k2] * K_shared[i_s, i_k2] * G_shared_exp[i_s] * Beta_shared[i_s] + ) T.reduce_sum(dg_fragment_reduce_tmp, dg_fragment, dim=1, clear=False) # correct dk - T.copy( - dk_fragment, dk[bb, bs * block_S:(bs + 1) * block_S, bh, - i_k * block_DK:(i_k + 1) * block_DK]) + T.copy(dk_fragment, dk[bb, bs * block_S : (bs + 1) * block_S, bh, i_k * block_DK : (i_k + 1) * block_DK]) # Update dv for i_v in T.Pipelined(T.ceildiv(DV, block_DV), num_stages=num_stages): - T.copy( - V[bb, bs * block_S:(bs + 1) * block_S, bh, i_v * block_DV:(i_v + 1) * block_DV], - V_shared) + T.copy(V[bb, bs * block_S : (bs + 1) * block_S, bh, i_v * block_DV : (i_v + 1) * block_DV], V_shared) for i_s, i_v2 in T.Parallel(block_S, block_DV): V_shared_beta[i_s, i_v2] = V_shared[i_s, i_v2] * Beta_shared[i_s] - T.copy( - du[bb, bs * block_S:(bs + 1) * block_S, bh, - i_v * block_DV:(i_v + 1) * block_DV], du_shared) + T.copy(du[bb, bs * block_S : (bs + 1) * block_S, bh, i_v * block_DV : (i_v + 1) * block_DV], du_shared) T.gemm(du_shared, V_shared_beta, dA_fragment, transpose_B=True) T.gemm(A_shared, du_shared, dv_fragment_beta, clear_accum=True, transpose_A=True) for i_s, i_v2 in T.Parallel(block_S, block_DV): @@ -247,30 +232,22 @@ def tilelang_wy_fast_bwd( # for i_s, i_v2 in T.Parallel(block_S, block_DV): # dbeta_fragment[i_s] = dbeta_fragment[i_s] + dv_fragment_beta[i_s, i_v2] * V_shared[i_s, i_v2] for i_s, i_v2 in T.Parallel(block_S, block_DV): - dbeta_fragment_reduce_tmpv[i_s, - i_v2] = dv_fragment_beta[i_s, i_v2] * V_shared[i_s, - i_v2] + dbeta_fragment_reduce_tmpv[i_s, i_v2] = dv_fragment_beta[i_s, i_v2] * V_shared[i_s, i_v2] T.reduce_sum(dbeta_fragment_reduce_tmpv, dbeta_fragment_v, dim=1, clear=False) - T.copy( - dv_fragment, dv[bb, bs * 
block_S:(bs + 1) * block_S, bh, - i_v * block_DV:(i_v + 1) * block_DV]) + T.copy(dv_fragment, dv[bb, bs * block_S : (bs + 1) * block_S, bh, i_v * block_DV : (i_v + 1) * block_DV]) # Temporary store dbeta, dg and dA for i_s in T.Parallel(block_S): dbeta[bb, bs * block_S + i_s, bh] = dbeta_fragment_k[i_s] + dbeta_fragment_v[i_s] dg[bb, bs * block_S + i_s, bh] = dg_fragment[i_s] # correct dA - T.copy(dA_fragment, dA[bb, bs * block_S:(bs + 1) * block_S, bh, :]) + T.copy(dA_fragment, dA[bb, bs * block_S : (bs + 1) * block_S, bh, :]) return kernel -@tilelang.jit( - pass_configs={ - tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, - tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True - }) +@tilelang.jit(pass_configs={tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True}) def tilelang_wy_fast_bwd_split( # task config B, @@ -308,20 +285,20 @@ def tilelang_wy_fast_bwd_split( @T.prim_func def kernel( - # input - K: T.Tensor(K_shape, dtype=input_dtype), - V: T.Tensor(V_shape, dtype=input_dtype), - Beta: T.Tensor(Beta_shape, dtype=input_dtype), - G: T.Tensor(G_shape, dtype=gate_dtype), - A: T.Tensor(A_shape, dtype=input_dtype), - dw: T.Tensor(dw_shape, dtype=input_dtype), - du: T.Tensor(du_shape, dtype=input_dtype), - dA: T.Tensor(dA_shape, dtype=input_dtype), - dk: T.Tensor(dk_shape, dtype=output_dtype), - dv: T.Tensor(dv_shape, dtype=output_dtype), - dbeta_k: T.Tensor(dbeta_shape, dtype=output_dtype), - dg_A_positive: T.Tensor(dA_shape, dtype=gate_dtype), - dg_A_negative: T.Tensor(dA_shape, dtype=gate_dtype), + # input + K: T.Tensor(K_shape, dtype=input_dtype), + V: T.Tensor(V_shape, dtype=input_dtype), + Beta: T.Tensor(Beta_shape, dtype=input_dtype), + G: T.Tensor(G_shape, dtype=gate_dtype), + A: T.Tensor(A_shape, dtype=input_dtype), + dw: T.Tensor(dw_shape, dtype=input_dtype), + du: T.Tensor(du_shape, dtype=input_dtype), + dA: T.Tensor(dA_shape, dtype=input_dtype), + dk: T.Tensor(dk_shape, dtype=output_dtype), + dv: T.Tensor(dv_shape, dtype=output_dtype), + dbeta_k: T.Tensor(dbeta_shape, dtype=output_dtype), + dg_A_positive: T.Tensor(dA_shape, dtype=gate_dtype), + dg_A_negative: T.Tensor(dA_shape, dtype=gate_dtype), ): with T.Kernel(T.ceildiv(S, block_S), B * H, threads=threads) as (bs, bbh): bb, bh = bbh // H, bbh % H @@ -350,7 +327,7 @@ def tilelang_wy_fast_bwd_split( T.clear(dA_A_fragment_1) T.clear(dA_A_fragment_2) - T.copy(A[bb, bs * block_S:(bs + 1) * block_S, bh, :], A_shared) + T.copy(A[bb, bs * block_S : (bs + 1) * block_S, bh, :], A_shared) for i_s in T.Parallel(block_S): Beta_shared[i_s] = Beta[bb, bs * block_S + i_s, bh] G_shared[i_s] = G[bb, bs * block_S + i_s, bh] @@ -361,7 +338,7 @@ def tilelang_wy_fast_bwd_split( # for i_s in T.Parallel(block_S): # dbeta_fragment[i_s] = dbeta[bb, bs * block_S + i_s, bh] # dg_fragment[i_s] = dg[bb, bs * block_S + i_s, bh] - T.copy(dA[bb, bs * block_S:(bs + 1) * block_S, bh, :], dA_shared) + T.copy(dA[bb, bs * block_S : (bs + 1) * block_S, bh, :], dA_shared) # T.copy(dA_shared, dA[bb, bs * block_S:(bs + 1) * block_S, bh, :]) # Update dA @@ -385,8 +362,7 @@ def tilelang_wy_fast_bwd_split( for i_s1, i_s2 in T.Parallel(block_S, block_S): with T.If(G[bb, bs * block_S + i_s1, bh] - G[bb, bs * block_S + i_s2, bh] <= 0): with T.Then(): - dA_fragment[i_s1, i_s2] *= T.exp(G[bb, bs * block_S + i_s1, bh] - - G[bb, bs * block_S + i_s2, bh]) + dA_fragment[i_s1, i_s2] *= T.exp(G[bb, bs * block_S + i_s1, bh] - G[bb, bs * block_S + i_s2, bh]) with T.Else(): dA_fragment[i_s1, i_s2] = 0 T.copy(dA_fragment, 
dA_shared) @@ -397,12 +373,8 @@ def tilelang_wy_fast_bwd_split( # Update dk using previous dk T.clear(A_fragment) for i_k in T.Pipelined(T.ceildiv(DK, block_DK), num_stages=num_stages): - T.copy( - K[bb, bs * block_S:(bs + 1) * block_S, bh, i_k * block_DK:(i_k + 1) * block_DK], - K_shared) - T.copy( - dk[bb, bs * block_S:(bs + 1) * block_S, bh, - i_k * block_DK:(i_k + 1) * block_DK], dk_shared) + T.copy(K[bb, bs * block_S : (bs + 1) * block_S, bh, i_k * block_DK : (i_k + 1) * block_DK], K_shared) + T.copy(dk[bb, bs * block_S : (bs + 1) * block_S, bh, i_k * block_DK : (i_k + 1) * block_DK], dk_shared) T.copy(dk_shared, dk_fragment) for i_s, i_k2 in T.Parallel(block_S, block_DK): K_shared_beta[i_s, i_k2] = K_shared[i_s, i_k2] * Beta_shared[i_s] @@ -411,18 +383,14 @@ def tilelang_wy_fast_bwd_split( # for i_s, i_k2 in T.Parallel(block_S, block_DK): # dbeta_fragment[i_s] = dbeta_fragment[i_s] + dk_fragment_beta[i_s, i_k2] * K_shared[i_s, i_k2] for i_s, i_k2 in T.Parallel(block_S, block_DK): - dbeta_fragment_reduce_tmpk[i_s, - i_k2] = dk_fragment_beta[i_s, i_k2] * K_shared[i_s, - i_k2] + dbeta_fragment_reduce_tmpk[i_s, i_k2] = dk_fragment_beta[i_s, i_k2] * K_shared[i_s, i_k2] T.reduce_sum(dbeta_fragment_reduce_tmpk, dbeta_fragment_k, dim=1, clear=False) T.gemm(dA_shared, K_shared_beta, dk_fragment, transpose_A=True) for i_s, i_k2 in T.Parallel(block_S, block_DK): dk_shared_beta[i_s, i_k2] = dk_fragment_beta[i_s, i_k2] * Beta_shared[i_s] for i_s, i_k2 in T.Parallel(block_S, block_DK): dk_fragment[i_s, i_k2] = dk_fragment[i_s, i_k2] + dk_shared_beta[i_s, i_k2] - T.copy( - dk_fragment, dk[bb, bs * block_S:(bs + 1) * block_S, bh, - i_k * block_DK:(i_k + 1) * block_DK]) + T.copy(dk_fragment, dk[bb, bs * block_S : (bs + 1) * block_S, bh, i_k * block_DK : (i_k + 1) * block_DK]) # Update dg and dbeta T.copy(A_fragment, A_shared) @@ -460,19 +428,25 @@ def run_test( threads=128, num_stages=0, ): - K, V, Beta, G, A, dw, du = prepare_input(B, S, H, DK, DV, chunk_size, - getattr(torch, input_dtype), - getattr(torch, output_dtype), - getattr(torch, - accum_dtype), getattr(torch, gate_dtype), - getattr(torch, state_dtype)) - dk_ref, dv_ref, dbeta_ref, dg_ref = prepare_output(B, S, H, DK, DV, chunk_size, - getattr(torch, output_dtype), - getattr(torch, gate_dtype), - getattr(torch, state_dtype)) + K, V, Beta, G, A, dw, du = prepare_input( + B, + S, + H, + DK, + DV, + chunk_size, + getattr(torch, input_dtype), + getattr(torch, output_dtype), + getattr(torch, accum_dtype), + getattr(torch, gate_dtype), + getattr(torch, state_dtype), + ) + dk_ref, dv_ref, dbeta_ref, dg_ref = prepare_output( + B, S, H, DK, DV, chunk_size, getattr(torch, output_dtype), getattr(torch, gate_dtype), getattr(torch, state_dtype) + ) dk_tilelang, dv_tilelang, dbeta_tilelang, dg_tilelang = prepare_output( - B, S, H, DK, DV, chunk_size, getattr(torch, output_dtype), getattr(torch, gate_dtype), - getattr(torch, state_dtype)) + B, S, H, DK, DV, chunk_size, getattr(torch, output_dtype), getattr(torch, gate_dtype), getattr(torch, state_dtype) + ) BS = chunk_size dA_tilelang = torch.empty(B, S, H, BS, dtype=getattr(torch, input_dtype)).cuda() dbeta_tilelang_k = torch.empty(B, S, H, dtype=getattr(torch, output_dtype)).cuda() @@ -480,28 +454,55 @@ def run_test( dg_tilelang_A_negative = torch.empty(B, S, H, BS, dtype=getattr(torch, gate_dtype)).cuda() # ref - dk_ref, dv_ref, dbeta_ref, dg_ref = bwd_prepare_wy_repr( - K, V, G, Beta, A, dw, du, cu_seqlens=None) + dk_ref, dv_ref, dbeta_ref, dg_ref = bwd_prepare_wy_repr(K, V, G, Beta, A, dw, du, 
cu_seqlens=None) # tilelang - kernel = tilelang_wy_fast_bwd(B, S, H, DK, DV, input_dtype, output_dtype, accum_dtype, - gate_dtype, state_dtype, chunk_size, block_DK, block_DV, threads, - num_stages) - dA_tilelang, dk_tilelang, dv_tilelang, dbeta_tilelang, dg_tilelang = kernel( - K, V, Beta, G, A, dw, du) + kernel = tilelang_wy_fast_bwd( + B, + S, + H, + DK, + DV, + input_dtype, + output_dtype, + accum_dtype, + gate_dtype, + state_dtype, + chunk_size, + block_DK, + block_DV, + threads, + num_stages, + ) + dA_tilelang, dk_tilelang, dv_tilelang, dbeta_tilelang, dg_tilelang = kernel(K, V, Beta, G, A, dw, du) torch.cuda.synchronize() - kernel_split = tilelang_wy_fast_bwd_split(B, S, H, DK, DV, input_dtype, output_dtype, - accum_dtype, gate_dtype, state_dtype, chunk_size, - block_DK, block_DV, threads, num_stages) - kernel_split(K, V, Beta, G, A, dw, du, dA_tilelang, dk_tilelang, dv_tilelang, dbeta_tilelang_k, - dg_tilelang_A_positive, dg_tilelang_A_negative) + kernel_split = tilelang_wy_fast_bwd_split( + B, + S, + H, + DK, + DV, + input_dtype, + output_dtype, + accum_dtype, + gate_dtype, + state_dtype, + chunk_size, + block_DK, + block_DV, + threads, + num_stages, + ) + kernel_split( + K, V, Beta, G, A, dw, du, dA_tilelang, dk_tilelang, dv_tilelang, dbeta_tilelang_k, dg_tilelang_A_positive, dg_tilelang_A_negative + ) torch.cuda.synchronize() dbeta_tilelang = dbeta_tilelang_k + dbeta_tilelang - dg_tilelang = dg_tilelang + dg_tilelang_A_positive.sum(dim=-1) - dg_tilelang_A_negative.sum( - dim=-1) + dg_tilelang = dg_tilelang + dg_tilelang_A_positive.sum(dim=-1) - dg_tilelang_A_negative.sum(dim=-1) + + from test_utils import assert_similar - from utils import assert_similar assert_similar(dk_ref, dk_tilelang, eps=1e-5, name="dk", raise_assert=False) assert_similar(dv_ref, dv_tilelang, eps=1e-5, name="dv", raise_assert=False) assert_similar(dbeta_ref, dbeta_tilelang, eps=1e-5, name="dbeta", raise_assert=False) @@ -517,11 +518,11 @@ def main(): H=8, DK=DK, DV=DV, - input_dtype="bfloat16", - output_dtype="bfloat16", - accum_dtype="float32", - gate_dtype="float32", - state_dtype="float32", + input_dtype=T.bfloat16, + output_dtype=T.bfloat16, + accum_dtype=T.float32, + gate_dtype=T.float32, + state_dtype=T.float32, chunk_size=64, block_DK=32, block_DV=32, diff --git a/examples/gdn/test_example_gdn_compilation.py b/examples/gdn/test_example_gdn_compilation.py index e184dbcace82ecd309997b294b60843ff2038efa..e749fa0874ee5dbddecbabe23ce1c72f74662647 100644 --- a/examples/gdn/test_example_gdn_compilation.py +++ b/examples/gdn/test_example_gdn_compilation.py @@ -1,16 +1,17 @@ -import tilelang.testing import torch +import tilelang.testing +from tilelang import language as T B = 1 S = 1024 # small but for test only. 
H = 32 DK = 128 DV = 128 -input_dtype = "bfloat16" -output_dtype = "bfloat16" -accum_dtype = "float32" -gate_dtype = "float32" -state_dtype = "float32" +input_dtype = T.bfloat16 +output_dtype = T.bfloat16 +accum_dtype = T.float32 +gate_dtype = T.float32 +state_dtype = T.float32 chunk_size = 64 use_g = True use_initial_state = True @@ -25,16 +26,10 @@ num_stages = 1 def test_example_wy_fast_compilation(): from example_wy_fast import tilelang_recompute_w_u_fwd, prepare_input + K, V, Beta, G, A = prepare_input( - B, - S, - H, - DK, - DV, - chunk_size, - getattr(torch, input_dtype), - getattr(torch, output_dtype), - gate_dtype=getattr(torch, gate_dtype)) + B, S, H, DK, DV, chunk_size, getattr(torch, input_dtype), getattr(torch, output_dtype), gate_dtype=getattr(torch, gate_dtype) + ) # tilelang block_S = chunk_size kernel = tilelang_recompute_w_u_fwd( @@ -52,22 +47,31 @@ def test_example_wy_fast_compilation(): block_DK=block_DK, block_DV=block_DV, threads=threads, - num_stages=num_stages) + num_stages=num_stages, + ) print(kernel.get_kernel_source()) W_tilelang, U_tilelang = kernel(K, V, Beta, G, A) def test_example_wy_fast_bwd_split_compilation(): from example_wy_fast_bwd_split import tilelang_wy_fast_bwd, tilelang_wy_fast_bwd_split, prepare_input, prepare_output - K, V, Beta, G, A, dw, du = prepare_input(B, S, H, DK, DV, chunk_size, - getattr(torch, input_dtype), - getattr(torch, output_dtype), - getattr(torch, - accum_dtype), getattr(torch, gate_dtype), - getattr(torch, state_dtype)) + + K, V, Beta, G, A, dw, du = prepare_input( + B, + S, + H, + DK, + DV, + chunk_size, + getattr(torch, input_dtype), + getattr(torch, output_dtype), + getattr(torch, accum_dtype), + getattr(torch, gate_dtype), + getattr(torch, state_dtype), + ) dk_tilelang, dv_tilelang, dbeta_tilelang, dg_tilelang = prepare_output( - B, S, H, DK, DV, chunk_size, getattr(torch, output_dtype), getattr(torch, gate_dtype), - getattr(torch, state_dtype)) + B, S, H, DK, DV, chunk_size, getattr(torch, output_dtype), getattr(torch, gate_dtype), getattr(torch, state_dtype) + ) BS = chunk_size dA_tilelang = torch.empty(B, S, H, BS, dtype=getattr(torch, input_dtype)).cuda() dbeta_tilelang_k = torch.empty(B, S, H, dtype=getattr(torch, output_dtype)).cuda() @@ -75,67 +79,146 @@ def test_example_wy_fast_bwd_split_compilation(): dg_tilelang_A_negative = torch.empty(B, S, H, BS, dtype=getattr(torch, gate_dtype)).cuda() # tilelang - kernel = tilelang_wy_fast_bwd(B, S, H, DK, DV, input_dtype, output_dtype, accum_dtype, - gate_dtype, state_dtype, chunk_size, block_DK, block_DV, threads, - num_stages) - dA_tilelang, dk_tilelang, dv_tilelang, dbeta_tilelang, dg_tilelang = kernel( - K, V, Beta, G, A, dw, du) + kernel = tilelang_wy_fast_bwd( + B, + S, + H, + DK, + DV, + input_dtype, + output_dtype, + accum_dtype, + gate_dtype, + state_dtype, + chunk_size, + block_DK, + block_DV, + threads, + num_stages, + ) + dA_tilelang, dk_tilelang, dv_tilelang, dbeta_tilelang, dg_tilelang = kernel(K, V, Beta, G, A, dw, du) torch.cuda.synchronize() - kernel_split = tilelang_wy_fast_bwd_split(B, S, H, DK, DV, input_dtype, output_dtype, - accum_dtype, gate_dtype, state_dtype, chunk_size, - block_DK, block_DV, threads, num_stages) - kernel_split(K, V, Beta, G, A, dw, du, dA_tilelang, dk_tilelang, dv_tilelang, dbeta_tilelang_k, - dg_tilelang_A_positive, dg_tilelang_A_negative) + kernel_split = tilelang_wy_fast_bwd_split( + B, + S, + H, + DK, + DV, + input_dtype, + output_dtype, + accum_dtype, + gate_dtype, + state_dtype, + chunk_size, + block_DK, + block_DV, + 
threads, + num_stages, + ) + kernel_split( + K, V, Beta, G, A, dw, du, dA_tilelang, dk_tilelang, dv_tilelang, dbeta_tilelang_k, dg_tilelang_A_positive, dg_tilelang_A_negative + ) torch.cuda.synchronize() dbeta_tilelang = dbeta_tilelang_k + dbeta_tilelang - dg_tilelang = dg_tilelang + dg_tilelang_A_positive.sum(dim=-1) - dg_tilelang_A_negative.sum( - dim=-1) + dg_tilelang = dg_tilelang + dg_tilelang_A_positive.sum(dim=-1) - dg_tilelang_A_negative.sum(dim=-1) def test_example_chunk_o_compilation(): from example_chunk_o import tilelang_chunk_fwd_o, prepare_input - Q, K, V, HIDDEN, G = prepare_input(B, S, H, DK, DV, chunk_size, getattr(torch, input_dtype), - getattr(torch, output_dtype), getattr(torch, accum_dtype), - getattr(torch, gate_dtype)) + + Q, K, V, HIDDEN, G = prepare_input( + B, + S, + H, + DK, + DV, + chunk_size, + getattr(torch, input_dtype), + getattr(torch, output_dtype), + getattr(torch, accum_dtype), + getattr(torch, gate_dtype), + ) scale = 1.0 / DK**0.5 block_S = chunk_size - kernel = tilelang_chunk_fwd_o(B, S, H, DK, DV, input_dtype, output_dtype, accum_dtype, - gate_dtype, chunk_size, scale, use_g, block_S, block_DK, block_DV, - threads, num_stages) + kernel = tilelang_chunk_fwd_o( + B, + S, + H, + DK, + DV, + input_dtype, + output_dtype, + accum_dtype, + gate_dtype, + chunk_size, + scale, + use_g, + block_S, + block_DK, + block_DV, + threads, + num_stages, + ) O_tilelang = kernel(Q, K, V, HIDDEN, G) # noqa: F841 def test_example_chunk_o_bwd_compilation(): from example_chunk_o_bwd import tilelang_chunk_o_bwd_dqkwg, prepare_input - Q, K, V, h, G, dO, dh, dv, W = prepare_input(B, S, H, DK, DV, chunk_size, - getattr(torch, input_dtype), - getattr(torch, output_dtype), - getattr(torch, accum_dtype), - getattr(torch, gate_dtype), - getattr(torch, state_dtype)) - kernel = tilelang_chunk_o_bwd_dqkwg(B, S, H, DK, DV, input_dtype, output_dtype, accum_dtype, - gate_dtype, state_dtype, chunk_size, 1.0, use_g, True, - block_DK, block_DV, threads, num_stages) - dq_tilelang, dk_tilelang, dw_tilelang, dg_tilelang = kernel(Q, K, V, h, G, dO, dh, dv, - W) # noqa: F841 + + Q, K, V, h, G, dO, dh, dv, W = prepare_input( + B, + S, + H, + DK, + DV, + chunk_size, + getattr(torch, input_dtype), + getattr(torch, output_dtype), + getattr(torch, accum_dtype), + getattr(torch, gate_dtype), + getattr(torch, state_dtype), + ) + kernel = tilelang_chunk_o_bwd_dqkwg( + B, + S, + H, + DK, + DV, + input_dtype, + output_dtype, + accum_dtype, + gate_dtype, + state_dtype, + chunk_size, + 1.0, + use_g, + True, + block_DK, + block_DV, + threads, + num_stages, + ) + + dq_tilelang, dk_tilelang, dw_tilelang, dg_tilelang = kernel(Q, K, V, h, G, dO, dh, dv, W) # noqa: F841 if use_g: dg_tilelang = dg_tilelang.sum(dim=0) def test_example_chunk_scaled_dot_kkt_compilation(): from example_chunk_scaled_dot_kkt import tilelang_chunk_scaled_dot_kkt_fwd, prepare_input - K, Beta, G = prepare_input(B, S, H, DK, getattr(torch, input_dtype), - getattr(torch, output_dtype), getattr(torch, accum_dtype)) + + K, Beta, G = prepare_input(B, S, H, DK, getattr(torch, input_dtype), getattr(torch, output_dtype), getattr(torch, accum_dtype)) block_S = chunk_size - kernel = tilelang_chunk_scaled_dot_kkt_fwd(B, S, H, DK, chunk_size, input_dtype, output_dtype, - accum_dtype, use_g, block_S, block_DK, threads, - num_stages) + kernel = tilelang_chunk_scaled_dot_kkt_fwd( + B, S, H, DK, chunk_size, input_dtype, output_dtype, accum_dtype, use_g, block_S, block_DK, threads, num_stages + ) A_tilelang = kernel(K, Beta, G) # noqa: F841 def 
test_example_cumsum_compilation(): from example_cumsum import tilelang_chunk_local_cumsum_scalar, prepare_cumsum_input, prepare_cumsum_output + G = prepare_cumsum_input(B, S, H, getattr(torch, gate_dtype)) G_new_tilelang = prepare_cumsum_output(B, S, H, getattr(torch, gate_dtype)) block_S = chunk_size @@ -157,33 +240,79 @@ def test_example_cumsum_compilation(): def test_example_chunk_delta_h_compilation(): from example_chunk_delta_h import tilelang_chunk_gated_delta_rule_fwd_h, prepare_input - K, W, U, G, initial_state = prepare_input(B, S, H, DK, DV, chunk_size, - getattr(torch, input_dtype), - getattr(torch, output_dtype), - getattr(torch, accum_dtype), - getattr(torch, gate_dtype)) - kernel = tilelang_chunk_gated_delta_rule_fwd_h(B, S, H, DK, DV, input_dtype, output_dtype, - accum_dtype, gate_dtype, state_dtype, chunk_size, - use_g, use_initial_state, store_final_state, - save_new_value, block_DK, block_DV, threads, - num_stages) - h_tilelang, final_state_tilelang, V_new_tilelang = kernel(K, W, U, G, - initial_state) # noqa: F841 + + K, W, U, G, initial_state = prepare_input( + B, + S, + H, + DK, + DV, + chunk_size, + getattr(torch, input_dtype), + getattr(torch, output_dtype), + getattr(torch, accum_dtype), + getattr(torch, gate_dtype), + ) + kernel = tilelang_chunk_gated_delta_rule_fwd_h( + B, + S, + H, + DK, + DV, + input_dtype, + output_dtype, + accum_dtype, + gate_dtype, + state_dtype, + chunk_size, + use_g, + use_initial_state, + store_final_state, + save_new_value, + block_DK, + block_DV, + threads, + num_stages, + ) + h_tilelang, final_state_tilelang, V_new_tilelang = kernel(K, W, U, G, initial_state) # noqa: F841 def test_example_chunk_delta_bwd_compilation(): from example_chunk_delta_bwd import tilelang_chunk_gated_delta_rule_bwd_dhu, prepare_input - Q, K, W, G, h0, dht, dO, dv = prepare_input(B, S, H, DK, DV, chunk_size, - getattr(torch, input_dtype), - getattr(torch, output_dtype), - getattr(torch, accum_dtype), - getattr(torch, gate_dtype), - getattr(torch, state_dtype)) - kernel = tilelang_chunk_gated_delta_rule_bwd_dhu(B, S, H, DK, DV, input_dtype, output_dtype, - accum_dtype, gate_dtype, state_dtype, - chunk_size, 1.0, use_g, use_initial_state, - use_final_state_gradient, block_DV, threads, - num_stages) + + Q, K, W, G, h0, dht, dO, dv = prepare_input( + B, + S, + H, + DK, + DV, + chunk_size, + getattr(torch, input_dtype), + getattr(torch, output_dtype), + getattr(torch, accum_dtype), + getattr(torch, gate_dtype), + getattr(torch, state_dtype), + ) + kernel = tilelang_chunk_gated_delta_rule_bwd_dhu( + B, + S, + H, + DK, + DV, + input_dtype, + output_dtype, + accum_dtype, + gate_dtype, + state_dtype, + chunk_size, + 1.0, + use_g, + use_initial_state, + use_final_state_gradient, + block_DV, + threads, + num_stages, + ) dh_tilelang, dh0_tilelang, dv2_tilelang = kernel(Q, K, W, G, h0, dht, dO, dv) # noqa: F841 diff --git a/examples/gdn/utils.py b/examples/gdn/test_utils.py similarity index 68% rename from examples/gdn/utils.py rename to examples/gdn/test_utils.py index 37f8d8e69f2bf388bc9b95fc571817489375d32c..3588551ce39ad1bee4267b727979336d73341561 100644 --- a/examples/gdn/utils.py +++ b/examples/gdn/test_utils.py @@ -9,7 +9,7 @@ def calc_sim(x, y, name="tensor"): x, y = x.data.double(), y.data.double() denominator = (x * x + y * y).sum() if denominator == 0: - print_red_warning(f'{name} all zero') + print_red_warning(f"{name} all zero") return 1 sim = 2 * (x * y).sum() / denominator return sim @@ -19,21 +19,19 @@ def assert_similar(x, y, eps=1e-8, name="tensor", data="", 
raise_assert=True): x_mask = torch.isfinite(x) y_mask = torch.isfinite(y) if not torch.all(x_mask == y_mask): - print_red_warning(f'{name} Error: isfinite mask mismatch') + print_red_warning(f"{name} Error: isfinite mask mismatch") if raise_assert: raise AssertionError - if not torch.isclose( - x.masked_fill(x_mask, 0), y.masked_fill(y_mask, 0), rtol=0, atol=0, - equal_nan=True).all(): - print_red_warning(f'{name} Error: nonfinite value mismatch') + if not torch.isclose(x.masked_fill(x_mask, 0), y.masked_fill(y_mask, 0), rtol=0, atol=0, equal_nan=True).all(): + print_red_warning(f"{name} Error: nonfinite value mismatch") if raise_assert: raise AssertionError x = x.masked_fill(~x_mask, 0) y = y.masked_fill(~y_mask, 0) sim = calc_sim(x, y, name) - diff = 1. - sim + diff = 1.0 - sim if not (0 <= diff <= eps): - print_red_warning(f'{name} Error: {diff}') + print_red_warning(f"{name} Error: {diff}") if raise_assert: raise AssertionError else: diff --git a/examples/gemm/README.md b/examples/gemm/README.md index d7833c97dc16bf7933f6692056ee4e46140a9e30..9ab7fb6614654e4594484d2e72ba9a7703b2706b 100644 --- a/examples/gemm/README.md +++ b/examples/gemm/README.md @@ -53,7 +53,7 @@ import tilelang from tilelang import Profiler import tilelang.language as T -def matmul(M, N, K, block_M, block_N, block_K, dtype="float16", accum_dtype="float"): +def matmul(M, N, K, block_M, block_N, block_K, dtype=T.float16, accum_dtype=T.float): @T.prim_func def main( A: T.Tensor((M, K), dtype), @@ -176,7 +176,7 @@ import tilelang.language as T # that helps align data for MMA (Matrix Multiply-Accumulate) operations. from tilelang.intrinsics import make_mma_swizzle_layout as make_swizzle_layout -def matmul(M, N, K, block_M, block_N, block_K, dtype="float16", accum_dtype="float"): +def matmul(M, N, K, block_M, block_N, block_K, dtype=T.float16, accum_dtype=T.float): @T.prim_func def main( A: T.Tensor((M, K), dtype), @@ -265,18 +265,18 @@ def tl_matmul( accum_dtype, ): assert in_dtype in [ - "float16", - "int8", + T.float16, + T.int8, ], "Currently only float16 and int8 are supported" assert out_dtype in [ - "float16", - "float32", - "int32", + T.float16, + T.float32, + T.int32, ], "Currently only float16, float32 and int32 are supported" micro_size_x = micro_size_y = micro_size_k = 16 - if out_dtype == "int32": + if out_dtype == T.int32: micro_size_k = 32 # This is a debug config diff --git a/examples/gemm/example_gemm.py b/examples/gemm/example_gemm.py index f18cd388a79e0972b12cdecd89bac6fdd780d64e..906a55d5d00fb516c00a10bacdf42c916a23bdb3 100644 --- a/examples/gemm/example_gemm.py +++ b/examples/gemm/example_gemm.py @@ -3,13 +3,12 @@ import tilelang.language as T @tilelang.jit(out_idx=[-1]) -def matmul(M, N, K, block_M, block_N, block_K, dtype="float16", accum_dtype="float"): - +def matmul(M, N, K, block_M, block_N, block_K, dtype=T.float16, accum_dtype=T.float32): @T.prim_func def gemm( - A: T.Tensor((M, K), dtype), - B: T.Tensor((K, N), dtype), - C: T.Tensor((M, N), dtype), + A: T.Tensor((M, K), dtype), + B: T.Tensor((K, N), dtype), + C: T.Tensor((M, N), dtype), ): with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=128) as (bx, by): A_shared = T.alloc_shared((block_M, block_K), dtype) diff --git a/examples/gemm/example_gemm_autotune.py b/examples/gemm/example_gemm_autotune.py index 661ef1276d496dbcc38fa0a77036a611fecceea8..ca322217341f5745193372751ad00a009d9f42c2 100644 --- a/examples/gemm/example_gemm_autotune.py +++ b/examples/gemm/example_gemm_autotune.py @@ -51,9 +51,9 @@ def get_configs(M, N, K, 
with_roller=False, topk=20): M=M, N=N, K=K, - in_dtype="float16", - out_dtype="float16", - accum_dtype="float", + in_dtype=T.float16, + out_dtype=T.float16, + accum_dtype=T.float32, ).with_arch(arch) func = carve_template.equivalent_function() @@ -90,7 +90,8 @@ def get_configs(M, N, K, with_roller=False, topk=20): num_stages, thread_num, enable_rasterization, - )) + ) + ) configs = [ { @@ -100,13 +101,13 @@ def get_configs(M, N, K, with_roller=False, topk=20): "num_stages": c[3], "thread_num": c[4], "enable_rasteration": c[5], # keep param name for backward-compat - } for c in _configs + } + for c in _configs ] return configs def get_best_config(M, N, K, with_roller=False): - def kernel( block_M=None, block_N=None, @@ -115,17 +116,16 @@ def get_best_config(M, N, K, with_roller=False): thread_num=None, enable_rasteration=None, ): - dtype = "bfloat16" - accum_dtype = "float" + dtype = T.bfloat16 + accum_dtype = T.float32 @T.prim_func def main( - A: T.Tensor((M, K), dtype), - B: T.Tensor((N, K), dtype), - C: T.Tensor((M, N), dtype), + A: T.Tensor((M, K), dtype), + B: T.Tensor((N, K), dtype), + C: T.Tensor((M, N), dtype), ): - with T.Kernel( - T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=thread_num) as (bx, by): + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=thread_num) as (bx, by): A_shared = T.alloc_shared((block_M, block_K), dtype) B_shared = T.alloc_shared((block_N, block_K), dtype) C_local = T.alloc_fragment((block_M, block_N), accum_dtype) @@ -146,15 +146,18 @@ def get_best_config(M, N, K, with_roller=False): return main - autotuner = AutoTuner.from_kernel( - kernel=kernel, configs=get_configs(M, N, K, with_roller)).set_compile_args( + autotuner = ( + AutoTuner.from_kernel(kernel=kernel, configs=get_configs(M, N, K, with_roller)) + .set_compile_args( out_idx=[-1], target="auto", - ).set_profile_args( + ) + .set_profile_args( supply_type=tl.TensorSupplyType.Integer, ref_prog=ref_program, skip_check=False, ) + ) return autotuner.run(warmup=3, rep=20) @@ -167,52 +170,20 @@ def get_heuristic_config() -> dict: sm_version = sm_major * 10 + sm_minor print(f"CUDA device capability: {sm_version}") if sm_version in {80}: - return { - "block_M": 128, - "block_N": 256, - "block_K": 32, - "num_stages": 2, - "thread_num": 128, - "enable_rasteration": True - } + return {"block_M": 128, "block_N": 256, "block_K": 32, "num_stages": 2, "thread_num": 128, "enable_rasteration": True} elif sm_version in {90}: - return { - "block_M": 128, - "block_N": 256, - "block_K": 64, - "num_stages": 3, - "thread_num": 256, - "enable_rasteration": True - } + return {"block_M": 128, "block_N": 256, "block_K": 64, "num_stages": 3, "thread_num": 256, "enable_rasteration": True} else: - return { - "block_M": 128, - "block_N": 256, - "block_K": 32, - "num_stages": 0, - "thread_num": 128, - "enable_rasteration": True - } + return {"block_M": 128, "block_N": 256, "block_K": 32, "num_stages": 0, "thread_num": 128, "enable_rasteration": True} @tl.jit(out_idx=[-1]) -def matmul(M, - N, - K, - block_M, - block_N, - block_K, - num_stages, - thread_num, - enable_rasteration, - dtype="float16", - accum_dtype="float"): - +def matmul(M, N, K, block_M, block_N, block_K, num_stages, thread_num, enable_rasteration, dtype=T.float16, accum_dtype=T.float32): @T.prim_func def gemm_autotune( - A: T.Tensor((M, K), dtype), - B: T.Tensor((N, K), dtype), - C: T.Tensor((M, N), dtype), + A: T.Tensor((M, K), dtype), + B: T.Tensor((N, K), dtype), + C: T.Tensor((M, N), dtype), ): with T.Kernel(T.ceildiv(N, block_N), 
T.ceildiv(M, block_M), threads=thread_num) as (bx, by): A_shared = T.alloc_shared((block_M, block_K), dtype) @@ -236,11 +207,7 @@ def matmul(M, return gemm_autotune -def main(M: int = 4096, - N: int = 4096, - K: int = 4096, - use_autotune: bool = False, - with_roller: bool = False): +def main(M: int = 4096, N: int = 4096, K: int = 4096, use_autotune: bool = False, with_roller: bool = False): use_autotune = True if use_autotune: result = get_best_config(M, N, K, with_roller) @@ -266,15 +233,7 @@ if __name__ == "__main__": parser.add_argument("--m", type=int, default=4096, help="Matrix dimension M") parser.add_argument("--n", type=int, default=4096, help="Matrix dimension N") parser.add_argument("--k", type=int, default=4096, help="Matrix dimension K") - parser.add_argument( - "--use_autotune", - action="store_true", - default=False, - help="Whether to use autotune for matmul configs") - parser.add_argument( - "--with_roller", - action="store_true", - default=False, - help="Whether to enable BitBLAS roller for search space") + parser.add_argument("--use_autotune", action="store_true", default=False, help="Whether to use autotune for matmul configs") + parser.add_argument("--with_roller", action="store_true", default=False, help="Whether to enable BitBLAS roller for search space") args = parser.parse_args() main(args.m, args.n, args.k, args.use_autotune, args.with_roller) diff --git a/examples/gemm/example_gemm_intrinsics.py b/examples/gemm/example_gemm_intrinsics.py index 5c014ce3a4b51d9b131382fa5af4773cd1b0f582..746e6ec011d8e44830431198dc03060ba4e5af91 100644 --- a/examples/gemm/example_gemm_intrinsics.py +++ b/examples/gemm/example_gemm_intrinsics.py @@ -4,7 +4,8 @@ import tilelang import tilelang.language as T from tilelang.intrinsics import get_swizzle_layout from tilelang.intrinsics.mma_macro_generator import ( - TensorCoreIntrinEmitter,) + TensorCoreIntrinEmitter, +) from tilelang.transform import simplify_prim_func @@ -34,18 +35,18 @@ def tl_matmul( accum_dtype, ): assert in_dtype in [ - "float16", - "int8", + T.float16, + T.int8, ], "Currently only float16 and int8 are supported" assert out_dtype in [ - "float16", - "float32", - "int32", + T.float16, + T.float32, + T.int32, ], "Currently only float16, float32 and int32 are supported" micro_size_x = micro_size_y = micro_size_k = 16 - if out_dtype == "int32": + if out_dtype == T.int32: micro_size_k = 32 # This is a debug config @@ -53,7 +54,7 @@ def tl_matmul( block_col_warps = 2 warp_row_tiles = 64 warp_col_tiles = 64 - # chunk = 32 if in_dtype == "float16" else 64 + # chunk = 32 if in_dtype == T.float16 else 64 chunk = 32 shared_scope = "shared.dyn" @@ -99,12 +100,11 @@ def tl_matmul( @T.prim_func def gemm_intrinsics( - A: T.Tensor(A_shape, in_dtype), - B: T.Tensor(B_shape, in_dtype), - C: T.Tensor((M, N), out_dtype), + A: T.Tensor(A_shape, in_dtype), + B: T.Tensor(B_shape, in_dtype), + C: T.Tensor((M, N), out_dtype), ): with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): - A_shared = T.alloc_shared(A_shared_shape, in_dtype, scope=shared_scope) B_shared = T.alloc_shared(B_shared_shape, in_dtype, scope=shared_scope) C_shared = T.alloc_shared(C_shared_shape, out_dtype, scope=shared_scope) @@ -112,10 +112,12 @@ def tl_matmul( B_local = T.alloc_local((warp_cols * local_size_b), in_dtype) C_local = T.alloc_local((warp_rows * warp_cols * local_size_c), accum_dtype) - T.annotate_layout({ - A_shared: make_swizzle_layout(A_shared), - B_shared: make_swizzle_layout(B_shared), - }) + T.annotate_layout( + { + 
A_shared: make_swizzle_layout(A_shared), + B_shared: make_swizzle_layout(B_shared), + } + ) # Improve L2 Cache T.use_swizzle(panel_size=10) @@ -123,7 +125,6 @@ def tl_matmul( T.clear(C_local) for ko in T.Pipelined((K // block_K), num_stages=stage): - # Load A into shared memory for i, k in T.Parallel(block_M, block_K): A_shared[i, k] = A[by * block_M + i, ko * block_K + k] @@ -133,7 +134,6 @@ def tl_matmul( B_shared[j, k] = B[bx * block_N + j, ko * block_K + k] for ki in T.serial(0, (block_K // micro_size_k)): - # Load A into fragment mma_emitter.ldmatrix_a(A_local, A_shared, ki) @@ -163,7 +163,7 @@ def ref_program(A, B): def main(M=4096, N=4096, K=4096): - in_dtype, out_dtype, accum_dtype = "float16", "float16", "float32" + in_dtype, out_dtype, accum_dtype = T.float16, T.float16, T.float32 kernel = tl_matmul(M, N, K, in_dtype, out_dtype, accum_dtype) src_code = kernel.get_kernel_source() # src_code is the generated cuda source diff --git a/examples/gemm/example_gemm_persistent.py b/examples/gemm/example_gemm_persistent.py index a2a7122d39193b274b015c038e8615e3e21d17af..30f55de6a06d06eadabd9461ee0eba1169521764 100644 --- a/examples/gemm/example_gemm_persistent.py +++ b/examples/gemm/example_gemm_persistent.py @@ -5,22 +5,12 @@ import argparse @tilelang.jit(out_idx=[-1]) -def matmul_non_persistent(M, - N, - K, - block_M, - block_N, - block_K, - threads, - num_stages, - dtype="float16", - accum_dtype="float"): - +def matmul_non_persistent(M, N, K, block_M, block_N, block_K, threads, num_stages, dtype=T.float16, accum_dtype=T.float32): @T.prim_func def main( - A: T.Tensor((M, K), dtype), - B: T.Tensor((K, N), dtype), - C: T.Tensor((M, N), dtype), + A: T.Tensor((M, K), dtype), + B: T.Tensor((K, N), dtype), + C: T.Tensor((M, N), dtype), ): with T.Kernel(T.ceildiv(M, block_M), T.ceildiv(N, block_N), threads=threads) as (bx, by): A_shared = T.alloc_shared((block_M, block_K), dtype) @@ -43,18 +33,9 @@ def matmul_non_persistent(M, @tilelang.jit(out_idx=[-1]) -def matmul_persistent(M, - N, - K, - block_M, - block_N, - block_K, - threads, - num_stages, - dtype="float16", - accum_dtype="float", - use_persistent_primitive=True): - +def matmul_persistent( + M, N, K, block_M, block_N, block_K, threads, num_stages, dtype=T.float16, accum_dtype=T.float32, use_persistent_primitive=True +): sm_num = driver.get_num_sms() m_blocks = T.ceildiv(M, block_M) n_blocks = T.ceildiv(N, block_N) @@ -63,9 +44,9 @@ def matmul_persistent(M, @T.prim_func def main( - A: T.Tensor((M, K), dtype), - B: T.Tensor((K, N), dtype), - C: T.Tensor((M, N), dtype), + A: T.Tensor((M, K), dtype), + B: T.Tensor((K, N), dtype), + C: T.Tensor((M, N), dtype), ): with T.Kernel(sm_num, threads=threads) as (block_id): A_shared = T.alloc_shared((block_M, block_K), dtype) @@ -90,9 +71,9 @@ def matmul_persistent(M, @T.prim_func def main_persistent_primitive( - A: T.Tensor((M, K), dtype), - B: T.Tensor((K, N), dtype), - C: T.Tensor((M, N), dtype), + A: T.Tensor((M, K), dtype), + B: T.Tensor((K, N), dtype), + C: T.Tensor((M, N), dtype), ): with T.Kernel(sm_num, threads=threads) as (block_id): A_shared = T.alloc_shared((block_M, block_K), dtype) @@ -100,8 +81,7 @@ def matmul_persistent(M, C_local = T.alloc_fragment((block_M, block_N), accum_dtype) C_shared = T.alloc_shared((block_M, block_N), dtype) - for bx, by in T.Persistent( - [T.ceildiv(M, block_M), T.ceildiv(N, block_N)], sm_num, block_id): + for bx, by in T.Persistent([T.ceildiv(M, block_M), T.ceildiv(N, block_N)], sm_num, block_id): T.clear(C_local) for k in T.Pipelined(T.ceildiv(K, block_K), 
num_stages=num_stages): T.copy(A[bx * block_M, k * block_K], A_shared) @@ -128,18 +108,15 @@ def main(M=4096, N=4096, K=4096): num_stages = 3 persistent_kernel = matmul_persistent(M, N, K, BLOCK_M, BLOCK_N, BLOCK_K, threads, num_stages) - persistent_profiler = persistent_kernel.get_profiler( - tensor_supply_type=tilelang.TensorSupplyType.Randn) + persistent_profiler = persistent_kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Randn) persistent_profiler.assert_allclose(ref_program, rtol=0.01, atol=0.01) print("Persistent GEMM: All check passed.") persistent_latency = persistent_profiler.do_bench(warmup=500) print(f"Persistent GEMM Latency: {persistent_latency} ms") print(f"Persistent GEMM TFlops: {total_flops / persistent_latency * 1e-9} TFlops") - non_persistent_kernel = matmul_non_persistent(M, N, K, BLOCK_M, BLOCK_N, BLOCK_K, threads, - num_stages) - non_persistent_profiler = non_persistent_kernel.get_profiler( - tensor_supply_type=tilelang.TensorSupplyType.Randn) + non_persistent_kernel = matmul_non_persistent(M, N, K, BLOCK_M, BLOCK_N, BLOCK_K, threads, num_stages) + non_persistent_profiler = non_persistent_kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Randn) non_persistent_profiler.assert_allclose(ref_program, rtol=0.01, atol=0.01) print("Non-Persistent GEMM: All check passed.") non_persistent_latency = non_persistent_profiler.do_bench(warmup=500) @@ -151,9 +128,9 @@ def main(M=4096, N=4096, K=4096): if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--M', type=int, default=8192, help='M dimension') - parser.add_argument('--N', type=int, default=8192, help='N dimension') - parser.add_argument('--K', type=int, default=8192, help='K dimension') + parser.add_argument("--M", type=int, default=8192, help="M dimension") + parser.add_argument("--N", type=int, default=8192, help="N dimension") + parser.add_argument("--K", type=int, default=8192, help="K dimension") args = parser.parse_args() M, N, K = args.M, args.N, args.K main(M, N, K) diff --git a/examples/gemm/example_gemm_schedule.py b/examples/gemm/example_gemm_schedule.py index f4727412b74be1f7ef4a59bec24f40109e55e50a..8663c878d043c86a765be08ccf99ae87ce9d1bee 100644 --- a/examples/gemm/example_gemm_schedule.py +++ b/examples/gemm/example_gemm_schedule.py @@ -3,13 +3,12 @@ import tilelang.language as T @tilelang.jit(out_idx=[-1]) -def matmul(M, N, K, block_M, block_N, block_K, dtype="float16", accum_dtype="float"): - +def matmul(M, N, K, block_M, block_N, block_K, dtype=T.float16, accum_dtype=T.float32): @T.prim_func def gemm_schedule( - A: T.Tensor((M, K), dtype), - B: T.Tensor((K, N), dtype), - C: T.Tensor((M, N), dtype), + A: T.Tensor((M, K), dtype), + B: T.Tensor((K, N), dtype), + C: T.Tensor((M, N), dtype), ): with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=128) as (bx, by): A_shared = T.alloc_shared((block_M, block_K), dtype) diff --git a/examples/gemm_fp8/example_tilelang_gemm_amd.py b/examples/gemm_fp8/example_tilelang_gemm_amd.py index 0e6ace7571f55eba02c13930b87808eb9dd3c9db..93f8c4980c36409e38afb1439b244918eee31748 100644 --- a/examples/gemm_fp8/example_tilelang_gemm_amd.py +++ b/examples/gemm_fp8/example_tilelang_gemm_amd.py @@ -17,10 +17,8 @@ def supply_prog(args): a_param, b_param = args M, K = a_param.shape N, _ = b_param.shape - a = (torch.randn(M, K, dtype=torch.float16, device='cuda') * - 0.01).to(dtype=torch.float8_e4m3fnuz) - b = (torch.randn(N, K, dtype=torch.float16, device='cuda') * - 0.01).to(dtype=torch.float8_e4m3fnuz) + a 
= (torch.randn(M, K, dtype=torch.float16, device="cuda") * 0.01).to(dtype=torch.float8_e4m3fnuz) + b = (torch.randn(N, K, dtype=torch.float16, device="cuda") * 0.01).to(dtype=torch.float8_e4m3fnuz) return [a, b] @@ -35,40 +33,36 @@ def get_configs(): valid_configs = [] - for m, n, k, stages, t, kp, gemm_type in itertools.product(block_Ms, block_Ns, block_Ks, - num_stages, num_threads, k_packs, - gemm_types): - valid_configs.append({ - "block_M": m, - "block_N": n, - "block_K": k, - "num_stages": stages, - "num_threads": t, - "k_pack": kp, - "gemm_type": gemm_type, - }) + for m, n, k, stages, t, kp, gemm_type in itertools.product(block_Ms, block_Ns, block_Ks, num_stages, num_threads, k_packs, gemm_types): + valid_configs.append( + { + "block_M": m, + "block_N": n, + "block_K": k, + "num_stages": stages, + "num_threads": t, + "k_pack": kp, + "gemm_type": gemm_type, + } + ) return valid_configs @tilelang.autotune( - configs=get_configs(), - cache_input_tensors=True, - ref_prog=ref_program, - manual_check_prog=manual_check_prog, - supply_prog=supply_prog) + configs=get_configs(), cache_input_tensors=True, ref_prog=ref_program, manual_check_prog=manual_check_prog, supply_prog=supply_prog +) @tilelang.jit(out_idx=[-1]) def fp8_matmul(M, N, K, block_M, block_N, block_K, num_stages, num_threads, k_pack, gemm_type): - dtype = "float8_e4m3fnuz" - accum_dtype = "float" + dtype = T.float8_e4m3fnuz + accum_dtype = T.float32 @T.prim_func def gemm_fp8_rs( - A: T.Tensor((M, K), dtype), - B: T.Tensor((N, K), dtype), - C: T.Tensor((M, N), accum_dtype), + A: T.Tensor((M, K), dtype), + B: T.Tensor((N, K), dtype), + C: T.Tensor((M, N), accum_dtype), ): - with T.Kernel( - T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=num_threads) as (bx, by): + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=num_threads) as (bx, by): A_local = T.alloc_fragment((block_M, block_K), dtype) B_shared = T.alloc_shared((block_N, block_K), dtype) C_local = T.alloc_fragment((block_M, block_N), accum_dtype) @@ -77,24 +71,17 @@ def fp8_matmul(M, N, K, block_M, block_N, block_K, num_stages, num_threads, k_pa for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages): T.copy(A[by * block_M, k * block_K], A_local) T.copy(B[bx * block_N, k * block_K], B_shared) - T.gemm( - A_local, - B_shared, - C_local, - transpose_B=True, - k_pack=k_pack, - policy=T.GemmWarpPolicy.FullRow) + T.gemm(A_local, B_shared, C_local, transpose_B=True, k_pack=k_pack, policy=T.GemmWarpPolicy.FullRow) T.copy(C_local, C[by * block_M, bx * block_N]) @T.prim_func def gemm_fp8_ss( - A: T.Tensor((M, K), dtype), - B: T.Tensor((N, K), dtype), - C: T.Tensor((M, N), accum_dtype), + A: T.Tensor((M, K), dtype), + B: T.Tensor((N, K), dtype), + C: T.Tensor((M, N), accum_dtype), ): - with T.Kernel( - T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=num_threads) as (bx, by): + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=num_threads) as (bx, by): A_shared = T.alloc_shared((block_M, block_K), dtype) B_shared = T.alloc_shared((block_N, block_K), dtype) C_local = T.alloc_fragment((block_M, block_N), accum_dtype) @@ -103,13 +90,7 @@ def fp8_matmul(M, N, K, block_M, block_N, block_K, num_stages, num_threads, k_pa for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages): T.copy(A[by * block_M, k * block_K], A_shared) T.copy(B[bx * block_N, k * block_K], B_shared) - T.gemm( - A_shared, - B_shared, - C_local, - transpose_B=True, - k_pack=k_pack, - policy=T.GemmWarpPolicy.FullRow) + T.gemm(A_shared, B_shared, C_local, 
transpose_B=True, k_pack=k_pack, policy=T.GemmWarpPolicy.FullRow) T.copy(C_local, C[by * block_M, bx * block_N]) @@ -123,10 +104,8 @@ def fp8_matmul(M, N, K, block_M, block_N, block_K, num_stages, num_threads, k_pa def test_gemm_fp8(M, N, K): kernel = fp8_matmul(M, N, K) - a = (torch.randn(M, K, dtype=torch.float16, device='cuda') * - 0.01).to(dtype=torch.float8_e4m3fnuz) - b = (torch.randn(N, K, dtype=torch.float16, device='cuda') * - 0.01).to(dtype=torch.float8_e4m3fnuz) + a = (torch.randn(M, K, dtype=torch.float16, device="cuda") * 0.01).to(dtype=torch.float8_e4m3fnuz) + b = (torch.randn(N, K, dtype=torch.float16, device="cuda") * 0.01).to(dtype=torch.float8_e4m3fnuz) c = kernel(a, b) ref_c = ref_program(a, b) torch_assert_close(c, ref_c, rtol=1e-2, atol=1e-2) diff --git a/examples/gemm_fp8/example_tilelang_gemm_fp8.py b/examples/gemm_fp8/example_tilelang_gemm_fp8.py index a403ed068a5c2bea9b5d234fe1dc7c1e04c826a5..1b440a7952a211b61f20e8c7c849d474a89cb0c1 100644 --- a/examples/gemm_fp8/example_tilelang_gemm_fp8.py +++ b/examples/gemm_fp8/example_tilelang_gemm_fp8.py @@ -1,7 +1,6 @@ import torch import tilelang import tilelang.language as T -from tilelang.utils.tensor import map_torch_type def calc_diff(x, y): @@ -12,13 +11,12 @@ def calc_diff(x, y): @tilelang.jit(out_idx=[-1]) -def matmul(M, N, K, block_M, block_N, block_K, dtype, accum_dtype="float"): - +def matmul(M, N, K, block_M, block_N, block_K, dtype, accum_dtype=T.float32): @T.prim_func def gemm_fp8( - A: T.Tensor((M, K), dtype), - B: T.Tensor((N, K), dtype), - C: T.Tensor((M, N), dtype), + A: T.Tensor((M, K), dtype), + B: T.Tensor((N, K), dtype), + C: T.Tensor((M, N), dtype), ): with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=128) as (bx, by): A_shared = T.alloc_shared((block_M, block_K), dtype) @@ -37,12 +35,12 @@ def matmul(M, N, K, block_M, block_N, block_K, dtype, accum_dtype="float"): def test_gemm_fp8(M, N, K, dtype): - torch_dtype = map_torch_type(dtype) + torch_dtype = T.dtype(dtype).as_torch() kernel = matmul(M, N, K, 128, 128, 64, dtype) - a = torch.randn(M, K, dtype=torch.float16, device='cuda').to(dtype=torch_dtype) - b = torch.randn(N, K, dtype=torch.float16, device='cuda').to(dtype=torch_dtype) + a = torch.randn(M, K, dtype=torch.float16, device="cuda").to(dtype=torch_dtype) + b = torch.randn(N, K, dtype=torch.float16, device="cuda").to(dtype=torch_dtype) c = kernel(a, b) @@ -57,8 +55,8 @@ def test_gemm_fp8(M, N, K, dtype): def main(): - test_gemm_fp8(1024, 1024, 1024, 'float8_e4m3') - test_gemm_fp8(1024, 1024, 1024, 'float8_e5m2') + test_gemm_fp8(1024, 1024, 1024, T.float8_e4m3fn) + test_gemm_fp8(1024, 1024, 1024, T.float8_e5m2) if __name__ == "__main__": diff --git a/examples/gemm_fp8/example_tilelang_gemm_fp8_2xAcc.py b/examples/gemm_fp8/example_tilelang_gemm_fp8_2xAcc.py index 1d9207aff2d3d33b83dccb1c106de027afbbd058..1c5d84d72f16cd7af7e1304bfbacd28c80795035 100644 --- a/examples/gemm_fp8/example_tilelang_gemm_fp8_2xAcc.py +++ b/examples/gemm_fp8/example_tilelang_gemm_fp8_2xAcc.py @@ -1,11 +1,10 @@ import torch import tilelang import tilelang.language as T -from tilelang.utils.tensor import map_torch_type @tilelang.jit(out_idx=[-1]) -def matmul(M, N, K, block_M, block_N, block_K, dtype, accum_dtype="float"): +def matmul(M, N, K, block_M, block_N, block_K, dtype, accum_dtype=T.float32): # for fp8 gemm, do one promote after 4 wgmma inst, i.e. block_K = 128. # if block_K < 128, promote after 128/block_K iters. # if block_K > 128, promote after every iter. 
@@ -13,9 +12,9 @@ def matmul(M, N, K, block_M, block_N, block_K, dtype, accum_dtype="float"): @T.prim_func def gemm_fp8_2xAcc( - A: T.Tensor((M, K), dtype), - B: T.Tensor((N, K), dtype), - C: T.Tensor((M, N), accum_dtype), + A: T.Tensor((M, K), dtype), + B: T.Tensor((N, K), dtype), + C: T.Tensor((M, N), accum_dtype), ): with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=128) as (bx, by): A_shared = T.alloc_shared((block_M, block_K), dtype) @@ -55,18 +54,18 @@ def calc_diff(x, y): def test_gemm_fp8(M, N, K, dtype): - torch_dtype = map_torch_type(dtype) + torch_dtype = T.dtype(dtype).as_torch() kernel = matmul(M, N, K, 128, 128, 64, dtype) - a = torch.rand(M, K, dtype=torch.float16, device='cuda') + a = torch.rand(M, K, dtype=torch.float16, device="cuda") a = (100 * (2 * a - 1)).to(dtype=torch_dtype) - b = torch.rand(N, K, dtype=torch.float16, device='cuda') + b = torch.rand(N, K, dtype=torch.float16, device="cuda") b = (100 * (2 * b - 1)).to(dtype=torch_dtype) c = kernel(a, b) - ref_c = (a.float() @ b.float().T) + ref_c = a.float() @ b.float().T diff = calc_diff(c, ref_c) print(f"diff: {diff}") @@ -74,8 +73,8 @@ def test_gemm_fp8(M, N, K, dtype): def main(): - test_gemm_fp8(1024, 1024, 8192, 'float8_e4m3') - test_gemm_fp8(1024, 1024, 8192, 'float8_e5m2') + test_gemm_fp8(1024, 1024, 8192, T.float8_e4m3fn) + test_gemm_fp8(1024, 1024, 8192, T.float8_e5m2) if __name__ == "__main__": diff --git a/examples/gemm_fp8/example_tilelang_gemm_fp8_intrinsic.py b/examples/gemm_fp8/example_tilelang_gemm_fp8_intrinsic.py index ed44aab6950491b40ced330b88fccc8a10966d2c..7ecde7c1b4e03cf7cc6f3f0284f4062d30e25cb5 100644 --- a/examples/gemm_fp8/example_tilelang_gemm_fp8_intrinsic.py +++ b/examples/gemm_fp8/example_tilelang_gemm_fp8_intrinsic.py @@ -5,7 +5,8 @@ from tvm import DataType import tilelang.language as T from tilelang.intrinsics import get_swizzle_layout from tilelang.intrinsics.mma_macro_generator import ( - TensorCoreIntrinEmitter,) + TensorCoreIntrinEmitter, +) from tilelang.transform import simplify_prim_func from tilelang.utils.tensor import map_torch_type @@ -38,21 +39,26 @@ def tl_matmul( accum_dtype, ): assert in_dtype in [ - "float16", - "float8_e4m3", - "float8_e5m2", - "int8", + T.float16, + T.float8_e4m3fn, + T.float8_e5m2, + T.int8, ], "Currently only float16 and int8 are supported" assert out_dtype in [ - "float16", - "float32", - "int32", + T.float16, + T.float32, + T.int32, ], "Currently only float16, float32 and int32 are supported" micro_size_x = micro_size_y = micro_size_k = 16 - is_float8 = in_dtype in ["float8_e4m3", "float8_e5m2"] - if out_dtype == "int32" or is_float8: + is_float8 = in_dtype in [ + T.float8_e4m3fn, + T.float8_e5m2, + T.float8_e4m3fn, + T.float8_e5m2fnuz, + ] + if out_dtype == T.int32 or is_float8: micro_size_k = 32 # This is a debug config @@ -60,7 +66,7 @@ def tl_matmul( block_col_warps = 2 warp_row_tiles = 32 warp_col_tiles = 32 - chunk = 32 if in_dtype == "float16" else 64 + chunk = 32 if in_dtype == T.float16 else 64 shared_scope = "shared.dyn" # Pipeline Stage @@ -105,12 +111,11 @@ def tl_matmul( @T.prim_func def gemm_fp8_intrinsic( - A: T.Tensor(A_shape, in_dtype), - B: T.Tensor(B_shape, in_dtype), - C: T.Tensor((M, N), out_dtype), + A: T.Tensor(A_shape, in_dtype), + B: T.Tensor(B_shape, in_dtype), + C: T.Tensor((M, N), out_dtype), ): with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): - A_shared = T.alloc_shared(A_shared_shape, in_dtype, scope=shared_scope) B_shared = T.alloc_shared(B_shared_shape, 
in_dtype, scope=shared_scope) C_shared = T.alloc_shared(C_shared_shape, out_dtype, scope=shared_scope) @@ -118,10 +123,12 @@ def tl_matmul( B_local = T.alloc_local((warp_cols * local_size_b), in_dtype) C_local = T.alloc_local((warp_rows * warp_cols * local_size_c), accum_dtype) - T.annotate_layout({ - A_shared: make_swizzle_layout(A_shared), - B_shared: make_swizzle_layout(B_shared), - }) + T.annotate_layout( + { + A_shared: make_swizzle_layout(A_shared), + B_shared: make_swizzle_layout(B_shared), + } + ) # Improve L2 Cache T.use_swizzle(panel_size=10) @@ -129,7 +136,6 @@ def tl_matmul( T.clear(C_local) for ko in T.Pipelined((K // block_K), num_stages=stage): - # Load A into shared memory for i, k in T.Parallel(block_M, block_K): A_shared[i, k] = A[by * block_M + i, ko * block_K + k] @@ -139,7 +145,6 @@ def tl_matmul( B_shared[j, k] = B[bx * block_N + j, ko * block_K + k] for ki in T.serial(0, (block_K // micro_size_k)): - # Load A into fragment mma_emitter.ldmatrix_a( A_local, @@ -215,8 +220,8 @@ def assert_tl_matmul_correctness(M, N, K, in_dtype, out_dtype, accum_dtype): def main(): - assert_tl_matmul_correctness(128, 128, 128, "float8_e4m3", "float32", "float32") - assert_tl_matmul_correctness(128, 128, 128, "float8_e5m2", "float32", "float32") + assert_tl_matmul_correctness(128, 128, 128, T.float8_e4m3fn, T.float32, T.float32) + assert_tl_matmul_correctness(128, 128, 128, T.float8_e5m2, T.float32, T.float32) if __name__ == "__main__": diff --git a/examples/gemm_fp8/example_tilelang_gemm_fp8_sm100.py b/examples/gemm_fp8/example_tilelang_gemm_fp8_sm100.py new file mode 100644 index 0000000000000000000000000000000000000000..aa7e8b360805459bda83b73dd7334b1bd923a201 --- /dev/null +++ b/examples/gemm_fp8/example_tilelang_gemm_fp8_sm100.py @@ -0,0 +1,124 @@ +import torch +import tilelang +import tilelang.language as T +from tilelang.utils.tensor import map_torch_type + + +def matmul( + M, + N, + K, + block_M, + block_N, + block_K, + trans_A, + trans_B, + in_dtype, + out_dtype, + accum_dtype, + num_stages, + threads, +): + A_shape = (K, M) if trans_A else (M, K) + B_shape = (N, K) if trans_B else (K, N) + A_shared_shape = (block_K, block_M) if trans_A else (block_M, block_K) + B_shared_shape = (block_N, block_K) if trans_B else (block_K, block_N) + + @T.prim_func + def main( + A: T.Tensor(A_shape, in_dtype), + B: T.Tensor(B_shape, in_dtype), + C: T.Tensor((M, N), out_dtype), + ): + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): + A_shared = T.alloc_shared(A_shared_shape, in_dtype) + B_shared = T.alloc_shared(B_shared_shape, in_dtype) + C_tmem = T.alloc_tmem([block_M, block_N], accum_dtype) + mbar = T.alloc_barrier(1) + C_local = T.alloc_fragment((block_M, block_N), accum_dtype) + C_shared = T.alloc_shared((block_M, block_N), out_dtype) + + for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages): + T.copy(A[by * block_M, k * block_K], A_shared) + T.copy(B[bx * block_N, k * block_K], B_shared) + T.gemm_v2( + A_shared, + B_shared, + C_tmem, + trans_A, + trans_B, + mbar=mbar, + wg_wait=-1, + clear_accum=(k == 0), + ) + T.mbarrier_wait_parity(mbar, k % 2) + + T.copy(C_tmem, C_local) + T.copy(C_local, C_shared) + + T.copy(C_shared, C[by * block_M, bx * block_N]) + + return main + + +def calc_diff(x, y): + x, y = x.double(), y.double() + denominator = (x * x + y * y).sum() + sim = 2 * (x * y).sum() / denominator + return 1 - sim + + +M, N, K = 4096, 4096, 8192 +block_M, block_N, block_K = 64, 256, 32 +trans_A, trans_B = False, True +num_stages = 2 
+threads = 256 +for tvm_fp8_dtype in [T.float8_e4m3fn, T.float8_e5m2]: + for tvm_acc_dtype in [T.float16, T.float32]: # , torch.float16]: + torch_fp8_dtype = map_torch_type(tvm_fp8_dtype) + torch_acc_dtype = map_torch_type(tvm_acc_dtype) + print(f"running {tvm_fp8_dtype} -> {tvm_acc_dtype}") + in_dtype, out_dtype, accum_dtype = tvm_fp8_dtype, tvm_acc_dtype, tvm_acc_dtype + + func = matmul( + M, + N, + K, + block_M, + block_N, + block_K, + trans_A, + trans_B, + in_dtype, + out_dtype, + accum_dtype, + num_stages, + threads, + ) + jit_kernel = tilelang.compile( + func, + out_idx=[2], + target="cuda", + pass_configs={ + tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, + tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, + tilelang.PassConfigKey.TL_ENABLE_PTXAS_VERBOSE_OUTPUT: True, + }, + ) + # jit_kernel.export_ptx("./dump.ptx") + # jit_kernel.export_sources("./dump.cu") + + a = torch.randn(M, K, device="cuda", dtype=torch.float16).to(torch_fp8_dtype) + b = torch.randn(N, K, device="cuda", dtype=torch.float16).to(torch_fp8_dtype) + + c = jit_kernel(a, b) + ref_c = (a.to(torch.half) @ b.T.to(torch.half)).float() + c = c.float() + diff = calc_diff(c, ref_c) + # assert diff < 1e-3, f"{diff}" + print(f"[{tvm_fp8_dtype} -> {tvm_acc_dtype}] diff = {diff}") + + profiler = jit_kernel.get_profiler() + latency = profiler.do_bench() + print(f"[{tvm_fp8_dtype} -> {tvm_acc_dtype}] Latency: {latency} ms") + print(f"[{tvm_fp8_dtype} -> {tvm_acc_dtype}] Flops: {2 * M * N * K / (latency / 1e3) / 1e12} TFLOPS") diff --git a/examples/gemm_sm100/README.md b/examples/gemm_sm100/README.md index 73dd76c308b46d4c14bf22a38dca7afe4e878537..28bb611bff167fdf7ba2291833edb91fd3d17beb 100644 --- a/examples/gemm_sm100/README.md +++ b/examples/gemm_sm100/README.md @@ -40,19 +40,19 @@ import tilelang.language as T @T.prim_func def main( - A: T.Tensor((M, K), "bfloat16"), - B: T.Tensor((N, K), "bfloat16"), - C: T.Tensor((M, N), "bfloat16"), + A: T.Tensor((M, K), T.bfloat16), + B: T.Tensor((N, K), T.bfloat16), + C: T.Tensor((M, N), T.bfloat16), ): with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=256) as (bx, by): # 1. Allocate memory buffers - A_shared = T.alloc_shared((block_M, block_K), "bfloat16") # A matrix shared memory - B_shared = T.alloc_shared((block_N, block_K), "bfloat16") # B matrix shared memory - C_tmem = T.alloc_tmem([block_M, block_N], "float") # TCGEN5MMA output to Tensor Memory + A_shared = T.alloc_shared((block_M, block_K), T.bfloat16) # A matrix shared memory + B_shared = T.alloc_shared((block_N, block_K), T.bfloat16) # B matrix shared memory + C_tmem = T.alloc_tmem([block_M, block_N], T.float) # TCGEN5MMA output to Tensor Memory mbar = T.alloc_barrier(1) # mbarrier synchronization primitive - C_local = T.alloc_fragment((block_M, block_N), "float") # Register storage - C_shared = T.alloc_shared((block_M, block_N), "bfloat16") # Output shared memory + C_local = T.alloc_fragment((block_M, block_N), T.float) # Register storage + C_shared = T.alloc_shared((block_M, block_N), T.bfloat16) # Output shared memory # 2. 
Main computation loop for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=1): diff --git a/examples/gemm_sm100/gemm_mma.py b/examples/gemm_sm100/gemm_mma.py index a58e5a7c005a0d8ff6d7d2316d10243860ede981..226e33c01e474ec646ba7f7e7ac39c86a2497c6a 100644 --- a/examples/gemm_sm100/gemm_mma.py +++ b/examples/gemm_sm100/gemm_mma.py @@ -4,13 +4,12 @@ import tilelang.language as T # add decorator @tilelang.jit if you want to return a torch function # @tilelang.jit -def matmul(M, N, K, block_M, block_N, block_K, dtype="float16", accum_dtype="float"): - +def matmul(M, N, K, block_M, block_N, block_K, dtype=T.float16, accum_dtype=T.float32): @T.prim_func def main( - A: T.Tensor((M, K), dtype), - B: T.Tensor((N, K), dtype), - C: T.Tensor((M, N), dtype), + A: T.Tensor((M, K), dtype), + B: T.Tensor((N, K), dtype), + C: T.Tensor((M, N), dtype), ): # Initialize Kernel Context with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=256) as (bx, by): @@ -62,7 +61,8 @@ jit_kernel = tilelang.compile( pass_configs={ tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, - }) + }, +) print(jit_kernel.get_kernel_source()) # 3. Test the kernel in Python with PyTorch data import torch diff --git a/examples/gemm_sm100/gemm_tcgen5mma.py b/examples/gemm_sm100/gemm_tcgen5mma.py index 9008c7ef5208aa6ea9d0145ceac5e43b4a534cbb..523a94fea6737bcd33f879ee8b49aaecaa3740af 100644 --- a/examples/gemm_sm100/gemm_tcgen5mma.py +++ b/examples/gemm_sm100/gemm_tcgen5mma.py @@ -25,9 +25,9 @@ def matmul( @T.prim_func def main( - A: T.Tensor(A_shape, in_dtype), - B: T.Tensor(B_shape, in_dtype), - C: T.Tensor((M, N), out_dtype), + A: T.Tensor(A_shape, in_dtype), + B: T.Tensor(B_shape, in_dtype), + C: T.Tensor((M, N), out_dtype), ): with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): A_shared = T.alloc_shared(A_shared_shape, in_dtype) @@ -40,15 +40,7 @@ def matmul( for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages): T.copy(A[by * block_M, k * block_K], A_shared) T.copy(B[bx * block_N, k * block_K], B_shared) - T.gemm( - A_shared, - B_shared, - C_tmem, - trans_A, - trans_B, - mbar=mbar, - wg_wait=-1, - clear_accum=k == 0) + T.gemm(A_shared, B_shared, C_tmem, trans_A, trans_B, mbar=mbar, wg_wait=-1, clear_accum=k == 0) T.mbarrier_wait_parity(mbar, k % 2) T.copy(C_tmem, C_local) @@ -62,12 +54,11 @@ def matmul( M, N, K = 4096, 4096, 8192 block_M, block_N, block_K = 128, 256, 128 trans_A, trans_B = False, True -in_dtype, out_dtype, accum_dtype = "bfloat16", "bfloat16", "float" +in_dtype, out_dtype, accum_dtype = T.bfloat16, T.bfloat16, T.float num_stages = 2 threads = 256 -func = matmul(M, N, K, block_M, block_N, block_K, trans_A, trans_B, in_dtype, out_dtype, - accum_dtype, num_stages, threads) +func = matmul(M, N, K, block_M, block_N, block_K, trans_A, trans_B, in_dtype, out_dtype, accum_dtype, num_stages, threads) jit_kernel = tilelang.compile( func, out_idx=[2], @@ -75,7 +66,8 @@ jit_kernel = tilelang.compile( pass_configs={ tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, - }) + }, +) print(jit_kernel.get_kernel_source()) @@ -88,4 +80,4 @@ torch.testing.assert_close(c, ref_c, rtol=1e-2, atol=1e-2) profiler = jit_kernel.get_profiler() latency = profiler.do_bench() print(f"Latency: {latency} ms") -print(f"Flops: {2 * M * N * K / (latency/1e3) / 1e12} TFLOPS") +print(f"Flops: {2 * M * N * K / (latency / 1e3) / 1e12} TFLOPS") diff --git 
a/examples/gemm_sp/example_custom_compress.py b/examples/gemm_sp/example_custom_compress.py new file mode 100644 index 0000000000000000000000000000000000000000..7b93f2a779e2e56dd496712abf9fa16363feafa8 --- /dev/null +++ b/examples/gemm_sp/example_custom_compress.py @@ -0,0 +1,336 @@ +import argparse + +import tilelang +import tilelang.language as T + +from tilelang.layout import make_cutlass_metadata_layout +from tilelang.utils.sparse import randn_semi_sparse +from tilelang.utils.tensor import torch_assert_close + +from triton.testing import do_bench + +import torch + +torch.manual_seed(42) + +DEFAULT_CONFIG = { # take best config from autotune script + "4090": { + T.float: { + "block_M": 128, + "block_N": 64, + "block_K": 64, + "num_stages": 1, + "thread_num": 128, + "policy": T.GemmWarpPolicy.Square, + "enable_rasterization": True, + }, + T.float16: { + "block_M": 256, + "block_N": 128, + "block_K": 64, + "num_stages": 2, + "thread_num": 128, + "policy": T.GemmWarpPolicy.Square, + "enable_rasterization": True, + }, + }, + "h20": { + T.float: { + "block_M": 128, + "block_N": 64, + "block_K": 128, + "num_stages": 3, + "thread_num": 128, + "policy": T.GemmWarpPolicy.Square, + "enable_rasterization": True, + }, + T.float16: { + "block_M": 128, + "block_N": 64, + "block_K": 128, + "num_stages": 3, + "thread_num": 128, + "policy": T.GemmWarpPolicy.Square, + "enable_rasterization": True, + }, + }, +} + +ARCH_INFO = {"8.0": (16, "int16"), "8.9": (16, "int16"), "9.0": (8, "uint8")} + + +@tilelang.jit(out_idx=[-1]) +def matmul_sp_fp16_custom_compress( + M, N, K, accum_dtype, block_M, block_N, block_K, num_stages, thread_num, policy, enable_rasterization, use_cutlass_layout +): + e_factor, e_dtype = (16, T.int16) + + @T.prim_func + def gemm_sp_fp16_custom_compress( + A_sparse: T.Tensor((M, K // 2), T.float16), + E: T.Tensor((M, K // e_factor), e_dtype), + B: T.Tensor((K, N), T.float16), + C: T.Tensor((M, N), accum_dtype), + ): + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=thread_num) as (bx, by): + A_shared = T.alloc_shared((block_M, block_K // 2), T.float16) + E_shared = T.alloc_shared((block_M, block_K // e_factor), e_dtype) + B_shared = T.alloc_shared((block_K, block_N), T.float16) + C_shared = T.alloc_shared((block_M, block_N), accum_dtype) + C_local = T.alloc_fragment((block_M, block_N), accum_dtype) + if use_cutlass_layout: + T.annotate_layout( + { + E: make_cutlass_metadata_layout(E, mma_dtype=T.float16, arch="8.0", block_k=block_K), + E_shared: make_cutlass_metadata_layout(E_shared, mma_dtype=T.float16, arch="8.0", block_k=block_K), + } + ) + T.clear(C_local) + T.disable_warp_group_reg_alloc() + T.use_swizzle(panel_size=10, enable=enable_rasterization) + for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages): + T.copy(A_sparse[by * block_M, k * block_K // 2], A_shared) + T.copy(E[by * block_M, k * block_K // e_factor], E_shared) + T.copy(B[k * block_K, bx * block_N], B_shared) + T.gemm_sp_v2(A_shared, E_shared, B_shared, C_local, False, False, policy=policy) + + T.copy(C_local, C_shared) + T.copy(C_shared, C[by * block_M, bx * block_N]) + + return gemm_sp_fp16_custom_compress + + +def torch_compress(dense): + """ + A naive compression function, where each 4-bit meta matches 4 elements in original matrix in row major layout. 
+ """ + if dense.dim() != 2: + raise RuntimeError(f"Expected 2-dimensional dense tensor, got {dense.dim()}-dimensional tensor") + + m, k = dense.shape + + meta_dtype = torch.int8 + if dense.dtype == torch.int8: + meta_dtype = torch.int32 + elif dense.dtype in [torch.half, torch.bfloat16, torch.float]: + meta_dtype = torch.int16 + else: + raise RuntimeError(f"Invalid datatype {dense.dtype} of dense matrix") + quadbits_per_meta_elem = meta_dtype.itemsize * 8 // 4 + if quadbits_per_meta_elem not in (4, 8): + raise RuntimeError("Invalid number of elements per meta element calculated") + + if meta_dtype == torch.int32: + if m % 16 != 0: + raise RuntimeError(f"Number of rows of dense matrix {m} must be divisible by 16") + else: + if m % 32 != 0: + raise RuntimeError(f"Number of rows of dense matrix {m} must be divisible by 32") + if k % (4 * quadbits_per_meta_elem) != 0: + raise RuntimeError(f"Number of columns of dense matrix {k} must be divisible by {4 * quadbits_per_meta_elem}") + + if dense.dtype != torch.float: + ksparse = 4 + dense_4 = dense.view(-1, k // ksparse, ksparse) + m0, m1, _m2, m3 = (dense_4 != 0).unbind(-1) + else: + ksparse = 2 + dense_2 = dense.view(-1, k // ksparse, ksparse) + m0, _m2 = m1, m3 = (dense_2 != 0).unbind(-1) + meta_ncols = k // (ksparse * quadbits_per_meta_elem) + + # Encoding quadruples of True/False values as follows: + # [True, True, False, False] -> 0b0100 + # [True, False, True, False] -> 0b1000 + # [False, True, True, False] -> 0b1001 + # [True, False, False, True ] -> 0b1100 + # [False, True, False, True ] -> 0b1101 + # [False, False, True, True ] -> 0b1110 + # Thus, lower two bits in the encoding are index of the True value + # at the lowest index in the quadruple, and the higher two bits in + # the encoding are index of the other True value in the quadruple. + # In case there are less than two True values, than False value or + # values at some index or indices are considered True for the + # encoding. In case there are more than two True values, then the + # excess True value(s) at some indices are considered False for + # the encoding. The exact encodings used for these cases are as + # follows: + # [False, False, False, False] -> 0b1110 + # [False, False, False, True ] -> 0b1110 + # [False, False, True, False] -> 0b1110 + # [False, True, False, False] -> 0b1001 + # [False, True, True, True ] -> 0b1101 + # [True, False, False, False] -> 0b1000 + # [True, False, True, True ] -> 0b1100 + # [True, True, False, True ] -> 0b0100 + # [True, True, True, False] -> 0b0100 + # [True, True, True, True ] -> 0b0100 + # These particular encodings are chosen, with the help of Espresso + # logic minimizer software, for the purpose of minimization of + # corresponding Boolean functions, that translate non-zero flags + # into encoding bits. Note also possible choices for the first + # and last of these encodings were limited only to (0b0100, + # 0b1110), in order to produce valid encodings for 1:2 sparsity + # case. 
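A worked instance of the encoding table above, using an illustrative 2:4 group; the low two bits of each nibble store the index of the first kept element and the high two bits the index of the second:

group = [1.5, 0.0, 0.0, -2.0]                          # nonzeros at indices 0 and 3
idx0, idx1 = [i for i, v in enumerate(group) if v != 0.0]
nibble = idx0 | (idx1 << 2)                            # 0 | (3 << 2) == 0b1100
assert nibble == 0b1100                                # matches [True, False, False, True] -> 0b1100
assert (nibble & 0x3, (nibble >> 2) & 0x3) == (0, 3)   # decoding recovers the kept positions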
+ + expr0 = m0 & m1 + expr1 = ~m0 & m1 + expr2 = ~m0 & ~m1 + bit0 = expr1 + bit1 = expr2 + bit2 = expr0 | expr2 | m3 + bit3 = expr1 | ~m1 + idxs0 = bit0 | (bit1.to(torch.int64) << 1) + idxs1 = bit2 | (bit3.to(torch.int64) << 1) + + if dense.dtype != torch.float: + sparse0 = dense_4.gather(-1, idxs0.unsqueeze(-1)) # type: ignore[possibly-undefined] + sparse1 = dense_4.gather(-1, idxs1.unsqueeze(-1)) + sparse = torch.stack((sparse0, sparse1), dim=-1).view(m, k // 2) + else: + sparse = dense_2.gather(-1, idxs0.unsqueeze(-1) // 2).view(m, k // 2) # type: ignore[possibly-undefined] + + meta_4 = idxs0 | (idxs1 << 2) + meta_n = meta_4.view((-1, meta_ncols, quadbits_per_meta_elem)).to(meta_dtype) + + if quadbits_per_meta_elem == 4: + meta = meta_n[:, :, 0] | (meta_n[:, :, 1] << 4) | (meta_n[:, :, 2] << 8) | (meta_n[:, :, 3] << 12) + elif quadbits_per_meta_elem == 8: + meta = ( + meta_n[:, :, 0] + | (meta_n[:, :, 1] << 4) + | (meta_n[:, :, 2] << 8) + | (meta_n[:, :, 3] << 12) + | (meta_n[:, :, 4] << 16) + | (meta_n[:, :, 5] << 20) + | (meta_n[:, :, 6] << 24) + | (meta_n[:, :, 7] << 28) + ) + + return (sparse, meta) + + +def decode_metadata(meta: torch.Tensor) -> torch.Tensor: + assert meta.dtype is torch.int16 + groups_per_meta = 16 // 4 # 4 groups per uint16 + out = [] + for g in range(groups_per_meta): + group_bits = (meta >> (g * 4)) & 0xF + idx0 = group_bits & 0x3 + idx1 = (group_bits >> 2) & 0x3 + out.append(torch.stack([idx0, idx1], dim=-1)) + return torch.concat(out, dim=-1).view(meta.shape[0], -1) + + +@tilelang.jit( + out_idx=[1, 2], + pass_configs={ + tilelang.PassConfigKey.TIR_DISABLE_VECTORIZE: True, + }, +) +def compress_kernel(M, K, block_M, block_K, dtype, use_cutlass_layout): + e_factor, e_dtype = ARCH_INFO["8.0"] + e_K = K // e_factor + elem, group = 2, 4 + + assert M % block_M == 0, "M must be divisible by block_M" + assert K % block_K == 0, "K must be divisible by block_K" + assert K % e_factor == 0, "K must be divisible by e_factor" + assert block_K % e_factor == 0, "block_K must be divisible by e_factor" + + @T.prim_func + def kernel( + A: T.Tensor((M, K), dtype), + A_sp: T.Tensor((M, K // 2), dtype), + E: T.Tensor((M, e_K), e_dtype), + ): + with T.Kernel(T.ceildiv(M, block_M), T.ceildiv(K, block_K), threads=block_M) as (bx, by): + A_shared = T.alloc_shared((block_M, block_K), dtype) + A_sp_shared = T.alloc_shared((block_M, block_K // 2), dtype) + E_shared = T.alloc_shared((block_M, block_K // e_factor), e_dtype) + if use_cutlass_layout: + T.annotate_layout( + { + E: make_cutlass_metadata_layout(E, mma_dtype=T.float16, arch="8.0", block_k=block_K), + E_shared: make_cutlass_metadata_layout(E_shared, mma_dtype=T.float16, arch="8.0", block_k=block_K), + } + ) + T.clear(A_sp_shared) + T.clear(E_shared) + # TODO: alloc_var seems buggy here + non_zero_cnt = T.alloc_local((1,), dtype=T.uint8) + non_zero_elt_log_idx = T.alloc_local((elem,), dtype=T.uint8) + T.copy(A[bx * block_M, by * block_K], A_shared) + for tm in T.Parallel(block_M): + for g_i in range(0, block_K // group): + a_k = g_i * group + non_zero_cnt[0] = 0 + for i in range(elem): + non_zero_elt_log_idx[i] = 0 + for i in range(group): + val = A_shared[tm, a_k + i] + if val != 0.0: + non_zero_elt_log_idx[non_zero_cnt[0]] = i + A_sp_shared[tm, a_k // 2 + non_zero_cnt[0]] = val + non_zero_cnt[0] += 1 + # TODO: use T.device_assert(non_zero_cnt <= 2) after rebasing main + if non_zero_cnt[0] == 1 and non_zero_elt_log_idx[0] == 3: + non_zero_elt_log_idx[0] = 0 + non_zero_elt_log_idx[1] = 3 + A_sp_shared[tm, a_k // 2 + 1] = 
A_sp_shared[tm, a_k // 2] + A_sp_shared[tm, a_k // 2] = 0.0 + elif non_zero_cnt[0] == 1: + A_sp_shared[tm, a_k // 2 + 1] = 0 + non_zero_elt_log_idx[1] = 3 + for i in T.serial(elem): + val = non_zero_elt_log_idx[i] + E_shared[tm, a_k // e_factor] |= T.shift_left(val, 4 * (g_i % (e_factor // group)) + 2 * i) + T.copy(A_sp_shared, A_sp[bx * block_M, by * block_K // 2]) + T.copy(E_shared, E[bx * block_M, by * block_K // e_factor]) + + return kernel + + +def main(): + parser = argparse.ArgumentParser(description="Autotuned MatMul Benchmark") + parser.add_argument("--m", type=int, default=16384, help="Matrix dimension M") + parser.add_argument("--n", type=int, default=16384, help="Matrix dimension N") + parser.add_argument("--k", type=int, default=16384, help="Matrix dimension K") + parser.add_argument("--use_cutlass_layout", action="store_true", help="Use cutlass layout for E tensor") + parser.add_argument("--use_torch_compressor", action="store_true", help="Use torch sparse for reference") + parser.add_argument("--accum_dtype", type=str, default=T.float, choices=[T.float, T.float16], help="Accumulation datatype") + parser.add_argument("--cfg", type=str, choices=["4090"], default="4090") + args = parser.parse_args() + kernel = matmul_sp_fp16_custom_compress( + args.m, args.n, args.k, args.accum_dtype, **DEFAULT_CONFIG[args.cfg][args.accum_dtype], use_cutlass_layout=args.use_cutlass_layout + ) + + a = randn_semi_sparse(args.m, args.k, device="cuda", dtype=torch.half) + b = torch.randn(args.k, args.n, device="cuda", dtype=torch.half) + + if args.use_torch_compressor: + assert not args.use_cutlass_layout, "torch sparse must be used with naive layout" + a_sparse, e = torch_compress(a) + else: + a_sparse, e = compress_kernel(args.m, args.k, 32, 32, T.float16, use_cutlass_layout=args.use_cutlass_layout)(a) + + c = kernel(a_sparse, e, b) + + ref_c = a @ b + + assert not c.isnan().any(), "Reference result contains NaNs, please report an issue" + torch_assert_close(c, ref_c.to(c.dtype), rtol=1e-3, atol=1e-3) + print(f"Precision check passed. Max diff: {(c - ref_c).abs().max()}, Mean diff: {(c - ref_c).abs().mean()}") + + latency = do_bench(lambda: kernel(a_sparse, e, b)) + ref_latency = do_bench(lambda: a @ b) + + total_flops = 2 * args.m * args.n * args.k + tflops = total_flops / latency / 1e9 + ref_tflops = total_flops / ref_latency / 1e9 + print(f"Sparse TFLOPS: {tflops:.2f}, Latency: {latency / 1e3} s") + print(f"Reference TFLOPS: {ref_tflops:.2f}, Latency: {ref_latency / 1e3:} s") + + +if __name__ == "__main__": + main() diff --git a/examples/gemm_sp/example_gemm_sp.py b/examples/gemm_sp/example_gemm_sp.py index 505f2b883718f12fd6c94693b41d6150216fd733..10f524adbc791867f94a8af203ce787217a13183 100644 --- a/examples/gemm_sp/example_gemm_sp.py +++ b/examples/gemm_sp/example_gemm_sp.py @@ -1,11 +1,9 @@ -# Copyright (c) Tile-AI Corporation. -# Licensed under the MIT License. 
import argparse import tilelang import tilelang.language as T -from tilelang.layout import make_metadata_layout +from tilelang.layout import make_cutlass_metadata_layout from tilelang.utils.sparse import compress, randn_semi_sparse from tilelang.contrib import nvcc from triton.testing import do_bench @@ -14,86 +12,79 @@ import torch arch = nvcc.get_target_compute_version() -ARCH_INFO = {"8.0": (16, "int16"), "8.9": (16, "int16"), "9.0": (8, "uint8")} - -default_config = { # take best config from autotune script +DEFAULT_CONFIG = { # take best config from autotune script "4090": { - 'float': { - 'block_M': 128, - 'block_N': 64, - 'block_K': 64, - 'num_stages': 1, - 'thread_num': 128, - 'policy': T.GemmWarpPolicy.Square, - 'enable_rasterization': True + T.float: { + "block_M": 128, + "block_N": 64, + "block_K": 64, + "num_stages": 1, + "thread_num": 128, + "policy": T.GemmWarpPolicy.Square, + "enable_rasterization": True, + }, + T.float16: { + "block_M": 256, + "block_N": 128, + "block_K": 64, + "num_stages": 2, + "thread_num": 128, + "policy": T.GemmWarpPolicy.Square, + "enable_rasterization": True, }, - 'float16': { - 'block_M': 256, - 'block_N': 128, - 'block_K': 64, - 'num_stages': 2, - 'thread_num': 128, - 'policy': T.GemmWarpPolicy.Square, - 'enable_rasterization': True - } }, "h20": { - 'float': { - 'block_M': 128, - 'block_N': 64, - 'block_K': 128, - 'num_stages': 3, - 'thread_num': 128, - 'policy': T.GemmWarpPolicy.Square, - 'enable_rasterization': True + T.float: { + "block_M": 128, + "block_N": 64, + "block_K": 128, + "num_stages": 3, + "thread_num": 128, + "policy": T.GemmWarpPolicy.Square, + "enable_rasterization": True, + }, + T.float16: { + "block_M": 128, + "block_N": 64, + "block_K": 128, + "num_stages": 3, + "thread_num": 128, + "policy": T.GemmWarpPolicy.Square, + "enable_rasterization": True, }, - 'float16': { - 'block_M': 128, - 'block_N': 64, - 'block_K': 128, - 'num_stages': 3, - 'thread_num': 128, - 'policy': T.GemmWarpPolicy.Square, - 'enable_rasterization': True - } - } + }, } +ARCH_INFO = {"8.0": (16, "int16"), "8.9": (16, "int16"), "9.0": (8, "uint8")} + @tilelang.jit(out_idx=[-1]) -def matmul_sp_fp16(M, N, K, accum_dtype, block_M, block_N, block_K, num_stages, thread_num, policy, - enable_rasterization): +def matmul_sp_fp16(M, N, K, accum_dtype, block_M, block_N, block_K, num_stages, thread_num, policy, enable_rasterization): e_factor, e_dtype = ARCH_INFO[arch] @T.prim_func def gemm_sp_fp16( - A_sparse: T.Tensor((M, K // 2), 'float16'), - E: T.Tensor((M, K // e_factor), e_dtype), - B: T.Tensor((K, N), 'float16'), - C: T.Tensor((M, N), accum_dtype), + A_sparse: T.Tensor((M, K // 2), T.float16), + E: T.Tensor((M, K // e_factor), e_dtype), + B: T.Tensor((K, N), T.float16), + C: T.Tensor((M, N), accum_dtype), ): with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=thread_num) as (bx, by): - A_shared = T.alloc_shared((block_M, block_K // 2), 'float16') + A_shared = T.alloc_shared((block_M, block_K // 2), T.float16) E_shared = T.alloc_shared((block_M, block_K // e_factor), e_dtype) - B_shared = T.alloc_shared((block_K, block_N), 'float16') + B_shared = T.alloc_shared((block_K, block_N), T.float16) C_shared = T.alloc_shared((block_M, block_N), accum_dtype) C_local = T.alloc_fragment((block_M, block_N), accum_dtype) T.clear(C_local) T.disable_warp_group_reg_alloc() T.use_swizzle(panel_size=10, enable=enable_rasterization) - T.annotate_layout({ - E: - make_metadata_layout( - E, mma_dtype="float16", backend="cutlass", block_k=block_K, arch=arch), - E_shared: 
- make_metadata_layout( - E_shared, - mma_dtype="float16", - backend="cutlass", - block_k=block_K, - arch=arch), - }) + T.annotate_layout( + { + E: make_cutlass_metadata_layout(E, mma_dtype=T.float16, block_k=block_K, arch=arch), + E_shared: make_cutlass_metadata_layout(E_shared, mma_dtype=T.float16, block_k=block_K, arch=arch), + } + ) for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages): T.copy(A_sparse[by * block_M, k * block_K // 2], A_shared) T.copy(E[by * block_M, k * block_K // e_factor], E_shared) @@ -111,25 +102,15 @@ def main(): parser.add_argument("--m", type=int, default=16384, help="Matrix dimension M") parser.add_argument("--n", type=int, default=16384, help="Matrix dimension N") parser.add_argument("--k", type=int, default=16384, help="Matrix dimension K") - parser.add_argument( - "--accum_dtype", - type=str, - default="float", - choices=["float", "float16"], - help="Accumulation datatype") - parser.add_argument("--cfg", type=str, choices=["4090", "h20"], required=True) + parser.add_argument("--accum_dtype", type=str, default=T.float, choices=[T.float, T.float16], help="Accumulation datatype") + parser.add_argument("--cfg", type=str, choices=["4090", "h20"], default="4090") args = parser.parse_args() - kernel = matmul_sp_fp16(args.m, args.n, args.k, args.accum_dtype, - **default_config[args.cfg][args.accum_dtype]) + kernel = matmul_sp_fp16(args.m, args.n, args.k, args.accum_dtype, **DEFAULT_CONFIG[args.cfg][args.accum_dtype]) - a = randn_semi_sparse(args.m, args.k, device='cuda', dtype=torch.half) - b = torch.randn(args.k, args.n, device='cuda', dtype=torch.half) + a = randn_semi_sparse(args.m, args.k, device="cuda", dtype=torch.half) + b = torch.randn(args.k, args.n, device="cuda", dtype=torch.half) - a_sparse, e = compress( - a, - transposed=False, - block_k=default_config[args.cfg][args.accum_dtype]['block_K'], - arch=arch) + a_sparse, e = compress(a, transposed=False, block_k=DEFAULT_CONFIG[args.cfg][args.accum_dtype]["block_K"], arch=arch) c = kernel(a_sparse, e, b) ref_c = a @ b @@ -144,8 +125,8 @@ def main(): total_flops = 2 * args.m * args.n * args.k tflops = total_flops / latency / 1e9 ref_tflops = total_flops / ref_latency / 1e9 - print(f"Sparse TFLOPS: {tflops:.2f}, Latency: {latency/1e3} s") - print(f"Reference TFLOPS: {ref_tflops:.2f}, Latency: {ref_latency/1e3:} s") + print(f"Sparse TFLOPS: {tflops:.2f}, Latency: {latency / 1e3} s") + print(f"Reference TFLOPS: {ref_tflops:.2f}, Latency: {ref_latency / 1e3:} s") if __name__ == "__main__": diff --git a/examples/gemm_sp/test_example_gemm_sp.py b/examples/gemm_sp/test_example_gemm_sp.py new file mode 100644 index 0000000000000000000000000000000000000000..fe26df14497e392bd0cdc03b9a3352d4fbd2f24b --- /dev/null +++ b/examples/gemm_sp/test_example_gemm_sp.py @@ -0,0 +1,16 @@ +import tilelang.testing + +import example_custom_compress +import example_gemm_sp + + +def test_example_custom_compress(): + example_custom_compress.main() + + +def test_example_gemm_sp(): + example_gemm_sp.main() + + +if __name__ == "__main__": + tilelang.testing.main() diff --git a/examples/gemm_splitk/example_tilelang_gemm_splitk.py b/examples/gemm_splitk/example_tilelang_gemm_splitk.py index c9666971180615813ae3ab36513e07126b32184b..62073c5bddfa93959ad21b98c7b7cda3ffcb1e76 100644 --- a/examples/gemm_splitk/example_tilelang_gemm_splitk.py +++ b/examples/gemm_splitk/example_tilelang_gemm_splitk.py @@ -3,27 +3,16 @@ import tilelang.language as T @tilelang.jit -def matmul(M, - N, - K, - block_M, - block_N, - block_K, - split_k, - 
dtype="float16", - accum_dtype="float", - out_dtype="float32"): - +def matmul(M, N, K, block_M, block_N, block_K, split_k, dtype=T.float16, accum_dtype=T.float32, out_dtype=T.float32): splitK = K // split_k @T.prim_func def main( - A: T.Tensor((M, K), dtype), - B: T.Tensor((N, K), dtype), - C: T.Tensor((M, N), out_dtype), + A: T.Tensor((M, K), dtype), + B: T.Tensor((N, K), dtype), + C: T.Tensor((M, N), out_dtype), ): - with T.Kernel( - T.ceildiv(N, block_N), T.ceildiv(M, block_M), split_k, threads=128) as (bx, by, bz): + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), split_k, threads=128) as (bx, by, bz): A_shared = T.alloc_shared((block_M, block_K), dtype) B_shared = T.alloc_shared((block_K, block_N), dtype) C_shared = T.alloc_shared((block_M, block_N), out_dtype) diff --git a/examples/gemm_splitk/example_tilelang_gemm_splitk_vectorize_atomicadd.py b/examples/gemm_splitk/example_tilelang_gemm_splitk_vectorize_atomicadd.py index 145d622edf1e497ad1664157dc3e638ecea5aa86..83e83b5d2a7fff6eddbed10ddf63a19f8d18d425 100644 --- a/examples/gemm_splitk/example_tilelang_gemm_splitk_vectorize_atomicadd.py +++ b/examples/gemm_splitk/example_tilelang_gemm_splitk_vectorize_atomicadd.py @@ -3,27 +3,16 @@ import tilelang.language as T @tilelang.jit -def matmul(M, - N, - K, - block_M, - block_N, - block_K, - split_k, - dtype="float16", - accum_dtype="float", - out_dtype="float32"): - +def matmul(M, N, K, block_M, block_N, block_K, split_k, dtype=T.float16, accum_dtype=T.float32, out_dtype=T.float32): splitK = K // split_k @T.prim_func def main( - A: T.Tensor((M, K), dtype), - B: T.Tensor((N, K), dtype), - C: T.Tensor((M, N), out_dtype), + A: T.Tensor((M, K), dtype), + B: T.Tensor((N, K), dtype), + C: T.Tensor((M, N), out_dtype), ): - with T.Kernel( - T.ceildiv(N, block_N), T.ceildiv(M, block_M), split_k, threads=128) as (bx, by, bz): + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), split_k, threads=128) as (bx, by, bz): A_shared = T.alloc_shared((block_M, block_K), dtype) B_shared = T.alloc_shared((block_K, block_N), dtype) C_shared = T.alloc_shared((block_M, block_N), out_dtype) diff --git a/examples/gemm_streamk/example_tilelang_gemm_streamk.py b/examples/gemm_streamk/example_tilelang_gemm_streamk.py index 31cf40647c28872be65f588f5cd532f7f676e001..7ec1541ea4aac095dc34b69ea55bdd73d66f4db7 100644 --- a/examples/gemm_streamk/example_tilelang_gemm_streamk.py +++ b/examples/gemm_streamk/example_tilelang_gemm_streamk.py @@ -39,7 +39,7 @@ total_tiles = num_block_m * num_block_n # Two-tile SK + DP streamk_tiles = total_tiles % streamk_programs -if (total_tiles - streamk_tiles > streamk_programs): # (total_tiles // total_programs > 1) +if total_tiles - streamk_tiles > streamk_programs: # (total_tiles // total_programs > 1) streamk_tiles += streamk_programs blocking_tiles = total_tiles - streamk_tiles @@ -87,8 +87,8 @@ def tl_matmul_streamk( C: T.Tensor, C_local: T.LocalBuffer, ): - start_iter = T.alloc_fragment((1,), "int32", "local") - end_iter = T.alloc_fragment((1,), "int32", "local") + start_iter = T.alloc_fragment((1,), T.int32, "local") + end_iter = T.alloc_fragment((1,), T.int32, "local") start_iter[0] = pid * streamk_full_tiles + T.min(pid, streamk_partial_tiles) last_iter = (pid + 1) * streamk_full_tiles + T.min(pid + 1, streamk_partial_tiles) @@ -135,7 +135,6 @@ def tl_matmul_streamk( C: T.Tensor, C_local: T.LocalBuffer, ): - for p in T.serial(sm_patition_factor): tile_id = pid + streamk_tiles + p * total_sm pid_m = tile_id // T.ceildiv(N, block_N) @@ -150,12 +149,11 @@ def 
tl_matmul_streamk( @T.prim_func def main( - A: T.Tensor(A_shape, dtypeAB), - B: T.Tensor(B_shape, dtypeAB), - C: T.Tensor((M, N), dtypeC), + A: T.Tensor(A_shape, dtypeAB), + B: T.Tensor(B_shape, dtypeAB), + C: T.Tensor((M, N), dtypeC), ): with T.Kernel(streamk_programs, threads=threads) as pid: - A_shared = T.alloc_shared(A_shared_shape, dtypeAB) B_shared = T.alloc_shared(B_shared_shape, dtypeAB) A_shared_full_tiles = T.alloc_shared(A_shared_shape, dtypeAB) @@ -181,9 +179,9 @@ def main(): BLOCK_SIZE_K, False, True, - "float16", - "float16", - "float32", + T.float16, + T.float16, + T.float32, 2, 64, ) diff --git a/examples/gemv/example_gemv.py b/examples/gemv/example_gemv.py index 4e43dcd9ad60505f7381b3ef02f5ff531e90e0f8..9dd0e4dd9f88cc4d74500e9b773cfcdb38999704 100644 --- a/examples/gemv/example_gemv.py +++ b/examples/gemv/example_gemv.py @@ -17,15 +17,14 @@ def naive_gemv( K: int, BLOCK_N: int, BLOCK_K: int, - dtype: str = "float16", - accum_dtype: str = "float", + dtype: T.dtype = T.float16, + accum_dtype: T.dtype = T.float, ): - @T.prim_func def main( - A: T.Tensor((K,), dtype), - B: T.Tensor((N, K), dtype), - C: T.Tensor((N,), dtype), + A: T.Tensor((K,), dtype), + B: T.Tensor((N, K), dtype), + C: T.Tensor((N,), dtype), ): with T.Kernel(T.ceildiv(N, BLOCK_N)) as bn: tn = T.get_thread_binding(0) # tn = threadIdx.x @@ -38,8 +37,7 @@ def naive_gemv( A_shared[tk] = A[bk * BLOCK_K + tk] B_shared[tn, tk] = B[bn * BLOCK_N + tn, bk * BLOCK_K + tk] for tk in T.serial(BLOCK_K): - C_reg[0] += A_shared[tk].astype(accum_dtype) * B_shared[tn, - tk].astype(accum_dtype) + C_reg[0] += A_shared[tk].astype(accum_dtype) * B_shared[tn, tk].astype(accum_dtype) C[bn * BLOCK_N + tn] = C_reg[0] return main @@ -51,15 +49,14 @@ def naive_splitk_gemv( K: int, BLOCK_N: int, BLOCK_K: int, - dtype: str = "float16", - accum_dtype: str = "float", + dtype: T.dtype = T.float16, + accum_dtype: T.dtype = T.float, ): - @T.prim_func def main( - A: T.Tensor((K,), dtype), - B: T.Tensor((N, K), dtype), - C: T.Tensor((N,), dtype), + A: T.Tensor((K,), dtype), + B: T.Tensor((N, K), dtype), + C: T.Tensor((N,), dtype), ): with T.Kernel(T.ceildiv(N, BLOCK_N), threads=(BLOCK_N, BLOCK_K)) as bn: tn = T.get_thread_binding(0) @@ -88,16 +85,16 @@ def splitk_gemv( BLOCK_N: int, BLOCK_K: int, reduce_threads: int, - dtype: str = "float16", - accum_dtype: str = "float", + dtype: T.dtype = T.float16, + accum_dtype: T.dtype = T.float, ): TILE_K = T.ceildiv(BLOCK_K, reduce_threads) @T.prim_func def main( - A: T.Tensor((K,), dtype), - B: T.Tensor((N, K), dtype), - C: T.Tensor((N,), dtype), + A: T.Tensor((K,), dtype), + B: T.Tensor((N, K), dtype), + C: T.Tensor((N,), dtype), ): with T.Kernel(T.ceildiv(N, BLOCK_N), threads=(BLOCK_N, reduce_threads)) as bn: tn = T.get_thread_binding(0) @@ -127,8 +124,8 @@ def splitk_gemv_vectorized( K: int, BLOCK_N: int, reduce_threads: int, - dtype: str = "float16", - accum_dtype: str = "float", + dtype: T.dtype = T.float16, + accum_dtype: T.dtype = T.float, ): MAX_TRANSACTION_SIZE_IN_BITS = 128 TILE_K = MAX_TRANSACTION_SIZE_IN_BITS // DataType(dtype).bits @@ -136,9 +133,9 @@ def splitk_gemv_vectorized( @T.prim_func def main( - A: T.Tensor((K,), dtype), - B: T.Tensor((N, K), dtype), - C: T.Tensor((N,), dtype), + A: T.Tensor((K,), dtype), + B: T.Tensor((N, K), dtype), + C: T.Tensor((N,), dtype), ): with T.Kernel(T.ceildiv(N, BLOCK_N), threads=(BLOCK_N, reduce_threads)) as bn: tn = T.get_thread_binding(0) @@ -168,8 +165,8 @@ def splitk_gemv_vectorized_tvm( K: int, BLOCK_N: int, reduce_threads: int, - dtype: str = 
"float16", - accum_dtype: str = "float", + dtype: T.dtype = T.float16, + accum_dtype: T.dtype = T.float, ): MAX_TRANSACTION_SIZE_IN_BITS = 128 TILE_K = MAX_TRANSACTION_SIZE_IN_BITS // DataType(dtype).bits @@ -177,9 +174,9 @@ def splitk_gemv_vectorized_tvm( @T.prim_func def main( - A: T.Tensor((K,), dtype), - B: T.Tensor((N, K), dtype), - C: T.Tensor((N,), dtype), + A: T.Tensor((K,), dtype), + B: T.Tensor((N, K), dtype), + C: T.Tensor((N,), dtype), ): with T.Kernel(T.ceildiv(N, BLOCK_N), threads=(BLOCK_N, reduce_threads)) as bn: tn = T.get_thread_binding(0) @@ -197,9 +194,9 @@ def splitk_gemv_vectorized_tvm( C_accum[0] += A_local[k].astype(accum_dtype) * B_local[k].astype(accum_dtype) C_reduced = T.alloc_local((1,), accum_dtype) with T.attr( - T.comm_reducer(lambda x, y: x + y, [T.Cast(accum_dtype, 0)]), - "reduce_scope", - T.reinterpret(T.uint64(0), dtype="handle"), + T.comm_reducer(lambda x, y: x + y, [T.Cast(accum_dtype, 0)]), + "reduce_scope", + T.reinterpret(T.uint64(0), dtype="handle"), ): T.evaluate( T.tvm_thread_allreduce( @@ -209,7 +206,8 @@ def splitk_gemv_vectorized_tvm( C_reduced[0], tk, dtype="handle", - )) + ) + ) C[bn * BLOCK_N + tn] = C_reduced[0] @@ -218,10 +216,8 @@ def splitk_gemv_vectorized_tvm( def get_block_template_configs(): iter_params = dict( - block_M=[2, 4, 8, 32, 64, 128], - block_N=[2, 4, 8, 32, 64, 128], - num_stages=[0, 1, 2, 3, 4], - threads=[32, 64, 128, 256]) + block_M=[2, 4, 8, 32, 64, 128], block_N=[2, 4, 8, 32, 64, 128], num_stages=[0, 1, 2, 3, 4], threads=[32, 64, 128, 256] + ) return [dict(zip(iter_params, values)) for values in itertools.product(*iter_params.values())] @@ -237,18 +233,11 @@ def get_block_template_configs(): }, out_idx=[2], ) -def gemv_alloc_reducer(M, - N, - block_M=128, - block_N=128, - num_stages=2, - threads=256, - dtype: str = "float16", - accum_dtype: str = "float"): - +def gemv_alloc_reducer( + M, N, block_M=128, block_N=128, num_stages=2, threads=256, dtype: T.dtype = T.float16, accum_dtype: T.dtype = T.float +): @T.prim_func - def main(a: T.Tensor((M, N), dtype), x: T.Tensor(N, dtype), o: T.Tensor(M, - dtype)): # type: ignore + def main(a: T.Tensor((M, N), dtype), x: T.Tensor(N, dtype), o: T.Tensor(M, dtype)): # type: ignore with T.Kernel(T.ceildiv(M, block_M), threads=threads) as i0_m: o_reducer = T.alloc_reducer(block_M, accum_dtype, replication="all") T.clear(o_reducer) @@ -287,17 +276,17 @@ def get_autotuned_kernel( BLOCK_N=None, reduce_threads=None, ): - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 MAX_TRANSACTION_SIZE_IN_BITS = 128 TILE_K = MAX_TRANSACTION_SIZE_IN_BITS // DataType(dtype).bits BLOCK_K = reduce_threads * TILE_K @T.prim_func def main( - A: T.Tensor((K,), dtype), - B: T.Tensor((N, K), dtype), - C: T.Tensor((N,), dtype), + A: T.Tensor((K,), dtype), + B: T.Tensor((N, K), dtype), + C: T.Tensor((N,), dtype), ): with T.Kernel(T.ceildiv(N, BLOCK_N), threads=(BLOCK_N, reduce_threads)) as bn: tn = T.get_thread_binding(0) @@ -315,9 +304,9 @@ def get_autotuned_kernel( C_accum[0] += A_local[k].astype(accum_dtype) * B_local[k].astype(accum_dtype) C_reduced = T.alloc_local((1,), accum_dtype) with T.attr( - T.comm_reducer(lambda x, y: x + y, [T.Cast(accum_dtype, 0)]), - "reduce_scope", - T.reinterpret(T.uint64(0), dtype="handle"), + T.comm_reducer(lambda x, y: x + y, [T.Cast(accum_dtype, 0)]), + "reduce_scope", + T.reinterpret(T.uint64(0), dtype="handle"), ): T.evaluate( T.tvm_thread_allreduce( @@ -327,21 +316,22 @@ def get_autotuned_kernel( C_reduced[0], tk, dtype="handle", - 
)) + ) + ) C[bn * BLOCK_N + tn] = C_reduced[0] return main -def check_correctness_and_bench(kernel, N, K, bench_ref=True): +def check_correctness_and_bench(kernel, N, K, do_bench=True): profiler = kernel.get_profiler() profiler.assert_allclose(lambda x, y: x @ y.T, atol=1e-2, rtol=1e-2) - if bench_ref: + if do_bench: latency = profiler.do_bench(lambda x, y: x @ y.T, warmup=50) print(f"Torch Latency: {latency} ms") - latency = profiler.do_bench(kernel, warmup=50) - print(f"TileLang Latency: {latency} ms\n") + latency = profiler.do_bench(kernel, warmup=50) + print(f"TileLang Latency: {latency} ms\n") def main(do_bench: bool = True): @@ -350,16 +340,16 @@ def main(do_bench: bool = True): parser.add_argument("--k", type=int, default=1024, help="Matrix dimension K") args, _ = parser.parse_known_args() N, K = args.n, args.k - check_correctness_and_bench(naive_gemv(N, K, 128, 128), N, K) - check_correctness_and_bench(naive_splitk_gemv(N, K, 32, 32), N, K) - check_correctness_and_bench(splitk_gemv(N, K, 32, 32, 32), N, K) - check_correctness_and_bench(splitk_gemv_vectorized(N, K, 2, 32), N, K) - check_correctness_and_bench(splitk_gemv_vectorized_tvm(N, K, 2, 32), N, K) - check_correctness_and_bench(gemv_alloc_reducer(N, K, block_M=128, block_N=128), N, K) + check_correctness_and_bench(naive_gemv(N, K, 128, 128), N, K, do_bench=do_bench) + check_correctness_and_bench(naive_splitk_gemv(N, K, 32, 32), N, K, do_bench=do_bench) + check_correctness_and_bench(splitk_gemv(N, K, 32, 32, 32), N, K, do_bench=do_bench) + check_correctness_and_bench(splitk_gemv_vectorized(N, K, 2, 32), N, K, do_bench=do_bench) + check_correctness_and_bench(splitk_gemv_vectorized_tvm(N, K, 2, 32), N, K, do_bench=do_bench) + check_correctness_and_bench(gemv_alloc_reducer(N, K, block_M=128, block_N=128), N, K, do_bench=do_bench) print("Test passed!") - if not do_bench: + if do_bench: best_result = get_autotuned_kernel(N, K) best_config = best_result.config kernel = splitk_gemv_vectorized_tvm(N, K, **best_config) diff --git a/examples/gemv/test_example_gemv.py b/examples/gemv/test_example_gemv.py index 3881ca7693989189a9afcd6c71e13820650d3696..323337a7a6a0f21f79ed4455d8243e3561f3847a 100644 --- a/examples/gemv/test_example_gemv.py +++ b/examples/gemv/test_example_gemv.py @@ -1,5 +1,3 @@ -import tilelang.testing - import example_gemv @@ -8,4 +6,4 @@ def test_example_gemv(): if __name__ == "__main__": - tilelang.testing.main() + test_example_gemv() diff --git a/examples/grouped_gemm/example_grouped_gemm_bwd.py b/examples/grouped_gemm/example_grouped_gemm_bwd.py index ac8da7e2c34eca885669417fad43da64289886e0..bb57c60731a4b495028612546def8b24324940a8 100644 --- a/examples/grouped_gemm/example_grouped_gemm_bwd.py +++ b/examples/grouped_gemm/example_grouped_gemm_bwd.py @@ -5,67 +5,45 @@ import tilelang import tilelang.language as T -@tilelang.jit( - out_idx=[2], pass_configs={ - "tl.disable_tma_lower": True, - "tl.disable_warp_specialized": True - }) -def grouped_gemm_fwd(batch_sum, - batch_count, - K, - N, - block_M, - block_N, - block_K, - num_stages=2, - threads=128, - dtype="float16"): +@tilelang.jit(out_idx=[2], pass_configs={"tl.disable_tma_lower": True, "tl.disable_warp_specialized": True}) +def grouped_gemm_fwd(batch_sum, batch_count, K, N, block_M, block_N, block_K, num_stages=2, threads=128, dtype=T.float16): """ args: a (torch.Tensor): Input tensor of shape (M, K). b (torch.Tensor): Input tensor of shape (G, K, N). 
""" - accum_dtype = "float32" + accum_dtype = T.float32 @T.prim_func def kernel( - A: T.Tensor([batch_sum, K], dtype), # type: ignore - B: T.Tensor([batch_count, K, N], dtype), # type: ignore - C: T.Tensor([batch_sum, N], dtype), # type: ignore - batch_sizes: T.Tensor([batch_count], "int32"), # type: ignore - batch_offsets: T.Tensor([batch_count], "int32"), # type: ignore - batch_padded_offsets: T.Tensor([batch_count], "int32"), # type: ignore + A: T.Tensor([batch_sum, K], dtype), # type: ignore + B: T.Tensor([batch_count, K, N], dtype), # type: ignore + C: T.Tensor([batch_sum, N], dtype), # type: ignore + batch_sizes: T.Tensor([batch_count], T.int32), # type: ignore + batch_offsets: T.Tensor([batch_count], T.int32), # type: ignore + batch_padded_offsets: T.Tensor([batch_count], T.int32), # type: ignore ): - - with T.Kernel( - T.ceildiv(batch_sum, block_M) + batch_count, T.ceildiv(N, block_N), - threads=threads) as (bx, by): + with T.Kernel(T.ceildiv(batch_sum, block_M) + batch_count, T.ceildiv(N, block_N), threads=threads) as (bx, by): A_shared = T.alloc_shared([block_M, block_K], dtype) B_shared = T.alloc_shared([block_K, block_N], dtype) C_local = T.alloc_fragment([block_M, block_N], accum_dtype) - cur_batch_idx = T.alloc_local([1], "int32") - cur_batch_size = T.alloc_local([1], "int32") + cur_batch_idx = T.alloc_local([1], T.int32) + cur_batch_size = T.alloc_local([1], T.int32) m_start_padded = bx * block_M for i in range(batch_count): - in_cur_batch_idx = (m_start_padded >= batch_padded_offsets[i]) + in_cur_batch_idx = m_start_padded >= batch_padded_offsets[i] cur_batch_idx[0] = T.if_then_else(in_cur_batch_idx, i, cur_batch_idx[0]) cur_batch_size[0] = batch_sizes[cur_batch_idx[0]] - m_start = m_start_padded - batch_padded_offsets[cur_batch_idx[0]] + batch_offsets[ - cur_batch_idx[0]] - actual_rows = T.max( - 0, - T.min(block_M, - cur_batch_size[0] + batch_padded_offsets[cur_batch_idx[0]] - m_start_padded)) + m_start = m_start_padded - batch_padded_offsets[cur_batch_idx[0]] + batch_offsets[cur_batch_idx[0]] + actual_rows = T.max(0, T.min(block_M, cur_batch_size[0] + batch_padded_offsets[cur_batch_idx[0]] - m_start_padded)) T.clear(C_local) for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages): - T.copy(A[m_start:m_start + block_M, k * block_K:(k + 1) * block_K], A_shared) - T.copy( - B[cur_batch_idx[0], k * block_K:(k + 1) * block_K, - by * block_N:(by + 1) * block_N], B_shared) + T.copy(A[m_start : m_start + block_M, k * block_K : (k + 1) * block_K], A_shared) + T.copy(B[cur_batch_idx[0], k * block_K : (k + 1) * block_K, by * block_N : (by + 1) * block_N], B_shared) T.gemm(A_shared, B_shared, C_local) for i, j in T.Parallel(block_M, block_N): @@ -76,7 +54,6 @@ def grouped_gemm_fwd(batch_sum, class _GroupedGEMM(torch.autograd.Function): - @staticmethod def forward(ctx, a, b, batch_sizes): block_M = 64 @@ -99,15 +76,11 @@ class _GroupedGEMM(torch.autograd.Function): for i in range(batch_count - 1): batch_offsets_list.append(batch_offsets_list[-1] + batch_sizes[i]) for i in range(batch_count - 1): - batch_padded_offsets_list.append(batch_padded_offsets_list[-1] + - math.ceil((batch_sizes[i] + 1) / padding_M) * - padding_M) + batch_padded_offsets_list.append(batch_padded_offsets_list[-1] + math.ceil((batch_sizes[i] + 1) / padding_M) * padding_M) batch_offsets = torch.tensor(batch_offsets_list, device=a.device, dtype=torch.int32) - batch_padded_offsets = torch.tensor( - batch_padded_offsets_list, device=a.device, dtype=torch.int32) + batch_padded_offsets = 
torch.tensor(batch_padded_offsets_list, device=a.device, dtype=torch.int32) - kernel = grouped_gemm_fwd(batch_sum, batch_count, K, N, block_M, block_N, block_K, - num_stages, threads) + kernel = grouped_gemm_fwd(batch_sum, batch_count, K, N, block_M, block_N, block_K, num_stages, threads) o = kernel(a, b, batch_sizes, batch_offsets, batch_padded_offsets) ctx.save_for_backward(a, b, batch_sizes, batch_offsets) @@ -135,8 +108,7 @@ class _GroupedGEMM(torch.autograd.Function): return x A, B, batch_sizes = [maybe_contiguous(x) for x in (A, B, batch_sizes)] - kernel = grouped_gemm_bwd(ctx.batch_sum, ctx.batch_count, M, N, block_M, block_N, block_K, - num_stages, threads) + kernel = grouped_gemm_bwd(ctx.batch_sum, ctx.batch_count, M, N, block_M, block_N, block_K, num_stages, threads) dB = kernel(A, grad_output, batch_sizes, batch_offsets) return None, dB, None @@ -172,9 +144,7 @@ def construct_inputs(batch_sizes_list, K, M, trans_b, padding_M, device, dtype): for i in range(batch_count - 1): batch_offsets_list.append(batch_offsets_list[-1] + batch_sizes_list[i]) for i in range(batch_count - 1): - batch_padded_offsets_list.append(batch_padded_offsets_list[-1] + - math.ceil((batch_sizes_list[i] + 1) / padding_M) * - padding_M) + batch_padded_offsets_list.append(batch_padded_offsets_list[-1] + math.ceil((batch_sizes_list[i] + 1) / padding_M) * padding_M) A = torch.randn(batch_sum, K, device=device, dtype=dtype) B = torch.randn(batch_count, K, M, device=device, dtype=dtype) C = torch.empty(batch_sum, M, device=device, dtype=dtype) @@ -187,40 +157,24 @@ def construct_inputs(batch_sizes_list, K, M, trans_b, padding_M, device, dtype): return A, B, C, batch_sizes, batch_offsets, batch_padded_offsets -@tilelang.jit( - out_idx=[2], pass_configs={ - "tl.disable_tma_lower": True, - "tl.disable_warp_specialized": True - }) -def grouped_gemm_bwd(batch_sum, - batch_count, - M, - N, - block_M, - block_N, - block_K, - num_stages=2, - threads=128, - dtype="float16"): +@tilelang.jit(out_idx=[2], pass_configs={"tl.disable_tma_lower": True, "tl.disable_warp_specialized": True}) +def grouped_gemm_bwd(batch_sum, batch_count, M, N, block_M, block_N, block_K, num_stages=2, threads=128, dtype=T.float16): """ args: a (torch.Tensor): Input tensor of shape (M, K). b (torch.Tensor): Input tensor of shape (G, K, N). 
""" - accum_dtype = "float32" + accum_dtype = T.float32 @T.prim_func def kernel( - A: T.Tensor([batch_sum, M], dtype), # type: ignore - B: T.Tensor([batch_sum, N], dtype), # type: ignore - C: T.Tensor([batch_count, M, N], dtype), # type: ignore - batch_sizes: T.Tensor([batch_count], "int32"), # type: ignore - batch_offsets: T.Tensor([batch_count], "int32"), # type: ignore + A: T.Tensor([batch_sum, M], dtype), # type: ignore + B: T.Tensor([batch_sum, N], dtype), # type: ignore + C: T.Tensor([batch_count, M, N], dtype), # type: ignore + batch_sizes: T.Tensor([batch_count], T.int32), # type: ignore + batch_offsets: T.Tensor([batch_count], T.int32), # type: ignore ): - - with T.Kernel( - T.ceildiv(M, block_M), T.ceildiv(N, block_N), batch_count, - threads=threads) as (bx, by, bz): + with T.Kernel(T.ceildiv(M, block_M), T.ceildiv(N, block_N), batch_count, threads=threads) as (bx, by, bz): A_shared = T.alloc_shared([block_K, block_M], dtype) B_shared = T.alloc_shared([block_K, block_N], dtype) C_local = T.alloc_fragment([block_M, block_N], accum_dtype) @@ -228,13 +182,9 @@ def grouped_gemm_bwd(batch_sum, T.clear(C_local) for k in T.Pipelined(T.ceildiv(batch_sizes[bz], block_K), num_stages=num_stages): for i, j in T.Parallel(block_K, block_M): - A_shared[i, j] = T.if_then_else( - i < batch_sizes[bz], A[batch_offsets[bz] + k * block_K + i, - bx * block_M + j], 0) + A_shared[i, j] = T.if_then_else(i < batch_sizes[bz], A[batch_offsets[bz] + k * block_K + i, bx * block_M + j], 0) for i, j in T.Parallel(block_K, block_N): - B_shared[i, j] = T.if_then_else( - i < batch_sizes[bz], B[batch_offsets[bz] + k * block_K + i, - by * block_N + j], 0) + B_shared[i, j] = T.if_then_else(i < batch_sizes[bz], B[batch_offsets[bz] + k * block_K + i, by * block_N + j], 0) T.gemm(A_shared, B_shared, C_local, transpose_A=True) T.copy(C_local, C[bz, bx * block_M, by * block_N]) @@ -242,23 +192,12 @@ def grouped_gemm_bwd(batch_sum, return kernel -def run_tilelang_grouped_gemm(batch_sizes_list, - K, - M, - block_M, - block_N, - block_K, - trans_b, - num_stages=2, - threads=128, - profile=False): - +def run_tilelang_grouped_gemm(batch_sizes_list, K, M, block_M, block_N, block_K, trans_b, num_stages=2, threads=128, profile=False): padding_M = block_M device = torch.device("cuda") dtype = torch.float16 - A, B, C, batch_sizes, batch_offsets, batch_padded_offsets = construct_inputs( - batch_sizes_list, K, M, False, padding_M, device, dtype) + A, B, C, batch_sizes, batch_offsets, batch_padded_offsets = construct_inputs(batch_sizes_list, K, M, False, padding_M, device, dtype) A.requires_grad_(False) B.requires_grad_(True) @@ -273,10 +212,7 @@ def run_tilelang_grouped_gemm(batch_sizes_list, O.backward(dO, retain_graph=True) dB, B.grad = B.grad.clone(), None - if ( - torch.allclose(O, O_ref, rtol=1e-2, atol=1e-2) and \ - torch.allclose(dB, dB_ref, rtol=1e-2, atol=1e-2) - ): + if torch.allclose(O, O_ref, rtol=1e-2, atol=1e-2) and torch.allclose(dB, dB_ref, rtol=1e-2, atol=1e-2): print("✅ Tilelang and Torch match") else: print("❌ Tilelang and Torch mismatch") @@ -284,12 +220,11 @@ def run_tilelang_grouped_gemm(batch_sizes_list, if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument( - '--batch_sizes', type=str, default="64, 128", help='comma-separated batch sizes') - parser.add_argument('--K', type=int, default=8192, help='reduce dim') - parser.add_argument('--M', type=int, default=8192, help='output dim') - parser.add_argument('--trans_b', action="store_true", help="transpose B") - 
parser.add_argument('--profile', action="store_true", help="profile") + parser.add_argument("--batch_sizes", type=str, default="64, 128", help="comma-separated batch sizes") + parser.add_argument("--K", type=int, default=8192, help="reduce dim") + parser.add_argument("--M", type=int, default=8192, help="output dim") + parser.add_argument("--trans_b", action="store_true", help="transpose B") + parser.add_argument("--profile", action="store_true", help="profile") args = parser.parse_args() batch_sizes_list = [int(x) for x in args.batch_sizes.split(",")] @@ -301,14 +236,4 @@ if __name__ == "__main__": num_stages = 2 threads = 256 - run_tilelang_grouped_gemm( - batch_sizes_list, - K, - M, - block_M, - block_N, - block_K, - trans_b, - num_stages, - threads, - profile=args.profile) + run_tilelang_grouped_gemm(batch_sizes_list, K, M, block_M, block_N, block_K, trans_b, num_stages, threads, profile=args.profile) diff --git a/examples/grouped_gemm/example_grouped_gemm_fwd.py b/examples/grouped_gemm/example_grouped_gemm_fwd.py index 9b58e3a21c6df034b3d5f91e8f7ec66b5afa4548..48d91605145405a94c6e697207bb63ee49bc6a66 100644 --- a/examples/grouped_gemm/example_grouped_gemm_fwd.py +++ b/examples/grouped_gemm/example_grouped_gemm_fwd.py @@ -18,8 +18,7 @@ def torch_gmm(a, b, batch_sizes, batch_offsets_tensor, trans_b=False): torch.Tensor: Resulting tensor after grouped matrix multiplication. """ assert a.shape[0] == sum(batch_sizes), "Sum of batch_sizes must equal the first dimension of a" - assert b.shape[0] == len( - batch_sizes), "The first dimension of b must match the length of batch_sizes" + assert b.shape[0] == len(batch_sizes), "The first dimension of b must match the length of batch_sizes" # Initialize output tensor output = torch.empty((sum(batch_sizes), b.shape[2]), device=a.device, dtype=a.dtype) @@ -38,15 +37,7 @@ def torch_gmm(a, b, batch_sizes, batch_offsets_tensor, trans_b=False): @tilelang.jit(out_idx=[2]) -def grouped_gemm(batch_sizes_list, - K, - N, - block_M, - block_N, - block_K, - num_stages=2, - threads=128, - dtype="float16"): +def grouped_gemm(batch_sizes_list, K, N, block_M, block_N, block_K, num_stages=2, threads=128, dtype=T.float16): """ args: a (torch.Tensor): Input tensor of shape (M, K). 
@@ -54,46 +45,39 @@ def grouped_gemm(batch_sizes_list, """ batch_sum = sum(batch_sizes_list) batch_count = len(batch_sizes_list) - accum_dtype = "float32" + accum_dtype = T.float32 total_m_blocks = sum((size + block_M - 1) // block_M for size in batch_sizes_list) @T.prim_func def kernel( - A: T.Tensor([batch_sum, K], dtype), # type: ignore - B: T.Tensor([batch_count, K, N], dtype), # type: ignore - C: T.Tensor([batch_sum, N], dtype), # type: ignore - batch_sizes: T.Tensor([batch_count], "int32"), # type: ignore - batch_offsets: T.Tensor([batch_count], "int32"), # type: ignore - batch_padded_offsets: T.Tensor([batch_count], "int32"), # type: ignore + A: T.Tensor([batch_sum, K], dtype), # type: ignore + B: T.Tensor([batch_count, K, N], dtype), # type: ignore + C: T.Tensor([batch_sum, N], dtype), # type: ignore + batch_sizes: T.Tensor([batch_count], T.int32), # type: ignore + batch_offsets: T.Tensor([batch_count], T.int32), # type: ignore + batch_padded_offsets: T.Tensor([batch_count], T.int32), # type: ignore ): - with T.Kernel(total_m_blocks, T.ceildiv(N, block_N), threads=threads) as (bx, by): A_shared = T.alloc_shared([block_M, block_K], dtype) B_shared = T.alloc_shared([block_K, block_N], dtype) C_local = T.alloc_fragment([block_M, block_N], accum_dtype) - cur_batch_idx = T.alloc_local([1], "int32") - cur_batch_size = T.alloc_local([1], "int32") + cur_batch_idx = T.alloc_local([1], T.int32) + cur_batch_size = T.alloc_local([1], T.int32) m_start_padded = bx * block_M for i in range(batch_count): - in_cur_batch_idx = (m_start_padded >= batch_padded_offsets[i]) + in_cur_batch_idx = m_start_padded >= batch_padded_offsets[i] cur_batch_idx[0] = T.if_then_else(in_cur_batch_idx, i, cur_batch_idx[0]) cur_batch_size[0] = batch_sizes[cur_batch_idx[0]] - m_start = m_start_padded - batch_padded_offsets[cur_batch_idx[0]] + batch_offsets[ - cur_batch_idx[0]] - actual_rows = T.max( - 0, - T.min(block_M, - cur_batch_size[0] + batch_padded_offsets[cur_batch_idx[0]] - m_start_padded)) + m_start = m_start_padded - batch_padded_offsets[cur_batch_idx[0]] + batch_offsets[cur_batch_idx[0]] + actual_rows = T.max(0, T.min(block_M, cur_batch_size[0] + batch_padded_offsets[cur_batch_idx[0]] - m_start_padded)) T.clear(C_local) for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages): - T.copy(A[m_start:m_start + block_M, k * block_K:(k + 1) * block_K], A_shared) - T.copy( - B[cur_batch_idx[0], k * block_K:(k + 1) * block_K, - by * block_N:(by + 1) * block_N], B_shared) + T.copy(A[m_start : m_start + block_M, k * block_K : (k + 1) * block_K], A_shared) + T.copy(B[cur_batch_idx[0], k * block_K : (k + 1) * block_K, by * block_N : (by + 1) * block_N], B_shared) T.gemm(A_shared, B_shared, C_local) for i, j in T.Parallel(block_M, block_N): @@ -111,8 +95,7 @@ def construct_inputs(batch_sizes_list, K, M, trans_b, padding_M, device, dtype): for i in range(batch_count - 1): batch_offsets_list.append(batch_offsets_list[-1] + batch_sizes_list[i]) for i in range(batch_count - 1): - batch_padded_offsets_list.append(batch_padded_offsets_list[-1] + - math.ceil((batch_sizes_list[i]) / padding_M) * padding_M) + batch_padded_offsets_list.append(batch_padded_offsets_list[-1] + math.ceil((batch_sizes_list[i]) / padding_M) * padding_M) A = torch.randn(batch_sum, K, device=device, dtype=dtype) B = torch.randn(batch_count, K, M, device=device, dtype=dtype) C = torch.empty(batch_sum, M, device=device, dtype=dtype) @@ -125,27 +108,16 @@ def construct_inputs(batch_sizes_list, K, M, trans_b, padding_M, device, dtype): return A, B, C, 
batch_sizes, batch_offsets, batch_padded_offsets -def run_tilelang_grouped_gemm(batch_sizes_list, - K, - M, - block_M, - block_N, - block_K, - trans_b, - num_stages=2, - threads=128, - profile=False): +def run_tilelang_grouped_gemm(batch_sizes_list, K, M, block_M, block_N, block_K, trans_b, num_stages=2, threads=128, profile=False): padding_M = block_M batch_sum = sum(batch_sizes_list) - kernel = grouped_gemm( - tuple(batch_sizes_list), K, M, block_M, block_N, block_K, num_stages, threads) + kernel = grouped_gemm(tuple(batch_sizes_list), K, M, block_M, block_N, block_K, num_stages, threads) # print(kernel.get_kernel_source()) device = torch.device("cuda") dtype = torch.float16 - A, B, C, batch_sizes, batch_offsets, batch_padded_offsets = construct_inputs( - batch_sizes_list, K, M, trans_b, padding_M, device, dtype) + A, B, C, batch_sizes, batch_offsets, batch_padded_offsets = construct_inputs(batch_sizes_list, K, M, trans_b, padding_M, device, dtype) out = kernel(A, B, batch_sizes, batch_offsets, batch_padded_offsets) ref_output = torch_gmm(A, B, batch_sizes, batch_offsets, trans_b) # print(out) @@ -157,8 +129,7 @@ def run_tilelang_grouped_gemm(batch_sizes_list, if profile: profiler = kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Auto) - latency = profiler.do_bench( - warmup=500, input_tensors=[A, B, batch_sizes, batch_offsets, batch_padded_offsets]) + latency = profiler.do_bench(warmup=500, input_tensors=[A, B, batch_sizes, batch_offsets, batch_padded_offsets]) print(f"Latency: {latency} ms") print(f"TFlops: {batch_sum * K * M * 2 / latency * 1e-9} TFlops") @@ -173,12 +144,11 @@ def test_grouped_gemm(): if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument( - '--batch_sizes', type=str, default="64, 128", help='comma-separated batch sizes') - parser.add_argument('--K', type=int, default=8192, help='reduce dim') - parser.add_argument('--M', type=int, default=8192, help='output dim') - parser.add_argument('--trans_b', action="store_true", help="transpose B") - parser.add_argument('--profile', action="store_true", help="profile") + parser.add_argument("--batch_sizes", type=str, default="64, 128", help="comma-separated batch sizes") + parser.add_argument("--K", type=int, default=8192, help="reduce dim") + parser.add_argument("--M", type=int, default=8192, help="output dim") + parser.add_argument("--trans_b", action="store_true", help="transpose B") + parser.add_argument("--profile", action="store_true", help="profile") args = parser.parse_args() batch_sizes_list = [int(x) for x in args.batch_sizes.split(",")] @@ -190,14 +160,4 @@ if __name__ == "__main__": num_stages = 2 threads = 256 - run_tilelang_grouped_gemm( - batch_sizes_list, - K, - M, - block_M, - block_N, - block_K, - trans_b, - num_stages, - threads, - profile=args.profile) + run_tilelang_grouped_gemm(batch_sizes_list, K, M, block_M, block_N, block_K, trans_b, num_stages, threads, profile=args.profile) diff --git a/examples/hadamard_transform/example_hadamard.py b/examples/hadamard_transform/example_hadamard.py index 531d4689183e308b2694fe0e8b0028202799b276..65f463b71bb06c53ab579a1c0389e71b0b1c387e 100644 --- a/examples/hadamard_transform/example_hadamard.py +++ b/examples/hadamard_transform/example_hadamard.py @@ -17,7 +17,7 @@ def is_pow_of_2(n): def hadamard(b, n, dtype): assert is_pow_of_2(n), "n must be a power of 2" assert 2 <= n <= 32768, "n must be in [2, 32768]" - elem_size = {'float32': 4, 'float16': 2, 'bfloat16': 2}[dtype] + elem_size = {T.float32: 4, T.float16: 2, 
T.bfloat16: 2}[dtype] logN = int(math.log2(n)) threads = [0, 1, 1, 1, 2, 4, 8, 16, 32, 32, 128, 256, 256, 256, 256, 256][logN] @@ -40,23 +40,21 @@ def hadamard(b, n, dtype): # print(f'{exchange_round=}') @T.macro - def warp_shfl(local: T.Tensor((thread_elem,), dtype), buf: T.Tensor((thread_elem,), dtype), - round: int): + def warp_shfl(local: T.Tensor((thread_elem,), dtype), buf: T.Tensor((thread_elem,), dtype), round: int): tx = T.get_thread_binding(0) for i in T.serial(round): tx_stride = 1 << i another_tx = tx ^ tx_stride - sign = ( - tx >> i - ) & 1 # get i-th lowest bit of tx, which determines the operation type for shared[tx, :] + sign = (tx >> i) & 1 # get i-th lowest bit of tx, which determines the operation type for shared[tx, :] for j in T.Pipelined(thread_elem, num_stages=1): buf[j] = T.tvm_warp_shuffle( - 0xffffffff, # mask of all threads + 0xFFFFFFFF, # mask of all threads local[j], another_tx % warp_size, warp_size, - warp_size) + warp_size, + ) local[j] = T.if_then_else(sign == 0, local[j] + buf[j], buf[j] - local[j]) @T.prim_func @@ -78,10 +76,8 @@ def hadamard(b, n, dtype): for j in T.serial(chunknum): chunkbase = j * chunksize for k in T.serial(chunksize // 2): - local[chunkbase + - k] = local[chunkbase + k] + local[chunkbase + k + chunksize // 2] - local[chunkbase + k + chunksize // - 2] = local[chunkbase + k] - 2 * local[chunkbase + k + chunksize // 2] + local[chunkbase + k] = local[chunkbase + k] + local[chunkbase + k + chunksize // 2] + local[chunkbase + k + chunksize // 2] = local[chunkbase + k] - 2 * local[chunkbase + k + chunksize // 2] # 3. Hadamard inside warp, n<=512 # In warp level, we rely on warp shuffle to exchange data inside each warp, without using shared memory @@ -131,28 +127,27 @@ def ref_program(x: torch.Tensor): assert x.ndim == 2 dim = x.shape[-1] assert is_pow_of_2(dim) - return F.linear( - x, torch.tensor(scipy.linalg.hadamard(dim, dtype=float), dtype=x.dtype, device=x.device)) + return F.linear(x, torch.tensor(scipy.linalg.hadamard(dim, dtype=float), dtype=x.dtype, device=x.device)) def main(): parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=64, help='Batch size') - parser.add_argument('--dim', type=int, default=32768, help='Dimension') + parser.add_argument("--batch", type=int, default=64, help="Batch size") + parser.add_argument("--dim", type=int, default=32768, help="Dimension") args = parser.parse_args() B, D = args.batch, args.dim - x = torch.randn((B, D), device='cuda') - kernel = hadamard(B, D, 'float32') + x = torch.randn((B, D), device="cuda") + kernel = hadamard(B, D, T.float32) y = kernel(x) y_ref = ref_program(x) torch.testing.assert_close(y, y_ref, atol=1e-2, rtol=1e-2) - print('All tests passed.') + print("All tests passed.") profiler = kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Auto) latency = profiler.do_bench(warmup=100) print("Tile-lang: {:.2f} ms".format(latency)) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/examples/lazy_jit/lazyjit.en.ipynb b/examples/lazy_jit/lazyjit.en.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..99cb977f0066b97da63368e5f550ef160bd52f1d --- /dev/null +++ b/examples/lazy_jit/lazyjit.en.ipynb @@ -0,0 +1,789 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "5e0deecc", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "from pathlib import Path\n", + "\n", + "sys.path.insert(0, str(Path.cwd().parent.parent.absolute()))\n", + "import 
tilelang\n", + "import torch\n", + "import tilelang.language as T" + ] + }, + { + "cell_type": "markdown", + "id": "1ca2c56d", + "metadata": {}, + "source": [ + "# Tilelang Lazy JIT" + ] + }, + { + "cell_type": "markdown", + "id": "156e7370", + "metadata": {}, + "source": [ + "## Tensor Annotation" + ] + }, + { + "cell_type": "markdown", + "id": "b070c109", + "metadata": {}, + "source": [ + "Tilelang Lazy JIT combines the jit generation and invocation logic.\n", + "\n", + "The function signature syntax is similar to triton but with significant enhancements, most notably allowing Tensor annotations:\n", + "\n", + "For example, the code below annotates a 2D Tensor with T.Tensor[[int, int], T.float16]\n", + "1. Each dimension is a compile-time constant; changing it triggers recompilation\n", + "2. Its dtype must be T.float16\n", + "\n", + "DType can also be Any or None in addition to a concrete type\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "60bf8954", + "metadata": {}, + "outputs": [], + "source": [ + "@tilelang.lazy_jit\n", + "def gemm(\n", + " A: T.Tensor[[int, int], T.float16],\n", + " B: T.Tensor[[int, int], T.float16],\n", + " out_dtype: T.dtype = T.float32,\n", + " block_M: int = 128,\n", + " block_N: int = 128,\n", + " block_K: int = 32,\n", + "):\n", + " M, K = A.shape\n", + " K, N = B.shape\n", + " C = T.empty((M, N), out_dtype)\n", + " with T.Kernel(T.ceildiv(M, block_M), T.ceildiv(N, block_N), threads=128) as (bx, by):\n", + " A_shared = T.alloc_shared((block_M, block_K), A.dtype)\n", + " B_shared = T.alloc_shared((block_K, block_N), B.dtype)\n", + " C_local = T.alloc_fragment((block_M, block_N), out_dtype)\n", + " T.clear(C_local)\n", + " for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=3):\n", + " T.copy(A[bx * block_M, k * block_K], A_shared)\n", + " T.copy(B[k * block_K, by * block_N], B_shared)\n", + " T.gemm(A_shared, B_shared, C_local)\n", + " T.copy(C_local, C[bx * block_M, by * block_N])\n", + " return C" + ] + }, + { + "cell_type": "markdown", + "id": "28f868fe", + "metadata": {}, + "source": [ + "Call the Tensor directly as an argument to trigger the full jit compile-and-run workflow:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "ee13394a", + "metadata": {}, + "outputs": [], + "source": [ + "A = torch.randn(1024, 512, dtype=torch.float16, device=\"cuda\")\n", + "B = torch.randn(512, 256, dtype=torch.float16, device=\"cuda\")\n", + "C = gemm(A, B)\n", + "\n", + "# check output is correct\n", + "C_ref = (A @ B).float()\n", + "torch.testing.assert_close(C, C_ref, rtol=1e-2, atol=1e-2)" + ] + }, + { + "cell_type": "markdown", + "id": "c6705091", + "metadata": {}, + "source": [ + "Change the call-site arguments; if the compiler parameters differ, it recompiles:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "d8aab5b7", + "metadata": {}, + "outputs": [], + "source": [ + "A = torch.randn(1024, 512, dtype=torch.float16, device=\"cuda\")\n", + "B = torch.randn(512, 1024, dtype=torch.float16, device=\"cuda\")\n", + "C = gemm(A, B, block_M=64, block_N=64)" + ] + }, + { + "cell_type": "markdown", + "id": "ce6b7391", + "metadata": {}, + "source": [ + "You can also manually call compile helpers to build a kernel\n", + "\n", + "1. `ker.compile` compiles the kernel\n", + "2. `ker.get_tir` retrieves the tir\n", + "3. 
`ker.par_compile` compiles in parallel" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "f3cf3a2d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2025-11-25 17:29:46 [TileLang:tilelang.cache.kernel_cache:WARNING]: Found kernel in memory cache. For better performance, consider using `@tilelang.jit` instead of direct kernel caching.\n" + ] + } + ], + "source": [ + "kernel = gemm.compile(A, B, block_M=64, block_N=64)\n", + "C = kernel(A, B)" + ] + }, + { + "cell_type": "markdown", + "id": "921761b5", + "metadata": {}, + "source": [ + "## More Tensor Annotation" + ] + }, + { + "cell_type": "markdown", + "id": "4539e54e", + "metadata": {}, + "source": [ + "### Separate the implementation with macros" + ] + }, + { + "cell_type": "markdown", + "id": "ad96ba65", + "metadata": {}, + "source": [ + "Next we'll implement a simple gemm in several ways. For convenience, first write a macro that captures the main gemm logic:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "171d4fe6", + "metadata": {}, + "outputs": [], + "source": [ + "@T.macro\n", + "def gemm_impl(A, B, C, M, N, K, block_M, block_N, block_K):\n", + " with T.Kernel(T.ceildiv(M, block_M), T.ceildiv(N, block_N), threads=128) as (bx, by):\n", + " A_shared = T.alloc_shared((block_M, block_K), A.dtype)\n", + " B_shared = T.alloc_shared((block_K, block_N), B.dtype)\n", + " C_local = T.alloc_fragment((block_M, block_N), C.dtype)\n", + " T.clear(C_local)\n", + " for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=3):\n", + " T.copy(A[bx * block_M, k * block_K], A_shared)\n", + " T.copy(B[k * block_K, by * block_N], B_shared)\n", + " T.gemm(A_shared, B_shared, C_local)\n", + " T.copy(C_local, C[bx * block_M, by * block_N])" + ] + }, + { + "cell_type": "markdown", + "id": "446a1acd", + "metadata": {}, + "source": [ + "### Mark dynamic shapes with T.dyn\n", + "\n", + "When some dimensions are dynamic, mark them with T.dyn. 
T.dyn can take a string argument to name the variable" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6a38aa95", + "metadata": {}, + "outputs": [], + "source": [ + "@tilelang.lazy_jit\n", + "def gemm_dyn_K(\n", + " A: T.Tensor[[int, T.dyn[\"K\"]], T.float16], # noqa: F821\n", + " B: T.Tensor[[T.dyn[\"K\"], int], T.float16], # noqa: F821\n", + "):\n", + " M, K = A.shape\n", + " K, N = B.shape\n", + " C = T.empty((M, N), T.float32)\n", + " gemm_impl(A, B, C, M, N, K, 128, 128, 32)\n", + " return C" + ] + }, + { + "cell_type": "markdown", + "id": "c60fd346", + "metadata": {}, + "source": [ + "Inspect the lazy_jit function signature: parameters with a `$` suffix are compile-time constants that may vary, and those with `$dyn` are runtime variables" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "c6992eb4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'A': TensorAnnot(shape=[A_shape_0$, K$dyn], strides=None, dtype=dtype('float16')),\n", + " 'B': TensorAnnot(shape=[K$dyn, B_shape_1$], strides=None, dtype=dtype('float16'))}" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gemm_dyn_K.func.annot" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "fe6cfdc8", + "metadata": {}, + "outputs": [], + "source": [ + "A = torch.randn(1024, 512, dtype=torch.float16, device=\"cuda\")\n", + "B = torch.randn(512, 256, dtype=torch.float16, device=\"cuda\")\n", + "C = gemm_dyn_K(A, B)\n", + "C_ref = (A @ B).float()\n", + "torch.testing.assert_close(C, C_ref, rtol=1e-2, atol=1e-2)" + ] + }, + { + "cell_type": "markdown", + "id": "2ee97bf7", + "metadata": {}, + "source": [ + "### Use T.StridedTensor to annotate tensors with strides\n", + "\n", + "Annotation format: T.StridedTensor[Shape, Stride, DType]. 
Each Shape or Stride entry can be\n", + "* int: compile-time constant\n", + "* T.dyn: runtime value\n", + "\n", + "DType can be None or Any" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "9dde1dae", + "metadata": {}, + "outputs": [], + "source": [ + "from typing import Any\n", + "\n", + "\n", + "@tilelang.lazy_jit\n", + "def as_contingious(A: T.StridedTensor[[T.dyn, T.dyn], [T.dyn, T.dyn], Any]):\n", + " M, N = A.shape\n", + " B = T.empty((M, N), A.dtype)\n", + " block_M = 128\n", + " block_N = 128\n", + " with T.Kernel(T.ceildiv(M, block_M), T.ceildiv(N, block_N), threads=128) as (bx, by):\n", + " T.copy(\n", + " A[bx * block_M : (bx + 1) * block_M, by * block_N : (by + 1) * block_N],\n", + " B[bx * block_M : (bx + 1) * block_M, by * block_N : (by + 1) * block_N],\n", + " )\n", + " return B" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "dec2c0a7", + "metadata": {}, + "outputs": [], + "source": [ + "A = torch.randn(1024, 1024, device=\"cuda\")\n", + "B = as_contingious(A[::2, ::2])\n", + "B_ref = A[::2, ::2].contiguous()\n", + "torch.testing.assert_close(B, B_ref)" + ] + }, + { + "cell_type": "markdown", + "id": "f5fb20d6", + "metadata": {}, + "source": [ + "## More Annotation" + ] + }, + { + "cell_type": "markdown", + "id": "890df0a2", + "metadata": {}, + "source": [ + "### Annotate tensors with T.ptr\n", + "lazy_jit lets you declare a handle with T.ptr, but you must define its shape inside the function via T.match_buffer" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "0fc17af6", + "metadata": {}, + "outputs": [], + "source": [ + "@tilelang.lazy_jit\n", + "def gemm_ptr(\n", + " A: T.ptr,\n", + " B: T.ptr,\n", + " M: int,\n", + " N: int,\n", + " K: int,\n", + "):\n", + " A = T.match_buffer(A, (M, K), T.float16)\n", + " B = T.match_buffer(B, (K, N), T.float16)\n", + " C = T.empty((M, N), T.float32)\n", + " gemm_impl(A, B, C, M, N, K, block_M=128, block_N=128, block_K=32)\n", + " return C" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "8e52a554", + "metadata": {}, + "outputs": [], + "source": [ + "A = torch.randn(1024, 512, dtype=torch.float16, device=\"cuda\")\n", + "B = torch.randn(512, 256, dtype=torch.float16, device=\"cuda\")\n", + "C = gemm_ptr(A, B, 1024, 256, 512)\n", + "C_ref = (A @ B).float()\n", + "torch.testing.assert_close(C, C_ref, rtol=1e-2, atol=1e-2)" + ] + }, + { + "cell_type": "markdown", + "id": "6b19ef90", + "metadata": {}, + "source": [ + "### Use T.int32 to annotate runtime variables\n", + "\n", + "lazy_jit lets you define runtime variables with T.int32 or other types, enabling a fully dynamic gemm similar to triton" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "c1e7598a", + "metadata": {}, + "outputs": [], + "source": [ + "@tilelang.lazy_jit\n", + "def gemm_ptr_dyn(\n", + " A: T.ptr,\n", + " B: T.ptr,\n", + " M: T.int32,\n", + " N: T.int32,\n", + " K: T.int32,\n", + "):\n", + " A = T.match_buffer(A, (M, K), T.float16, strides=(K, 1))\n", + " B = T.match_buffer(B, (K, N), T.float16, strides=(N, 1))\n", + " C = T.empty((M, N), T.float32)\n", + " gemm_impl(A, B, C, M, N, K, block_M=128, block_N=128, block_K=32)\n", + " return C" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "9e9a4c88", + "metadata": {}, + "outputs": [], + "source": [ + "A = torch.randn(1024, 512, dtype=torch.float16, device=\"cuda\")\n", + "B = torch.randn(512, 256, dtype=torch.float16, device=\"cuda\")\n", + "C = gemm_ptr_dyn(A, B, 1024, 256, 512)\n", + "C_ref = (A 
@ B).float()\n", + "torch.testing.assert_close(C, C_ref, rtol=1e-2, atol=1e-2)" + ] + }, + { + "cell_type": "markdown", + "id": "39166cb4", + "metadata": {}, + "source": [ + "## Compilation and parallel compilation" + ] + }, + { + "cell_type": "markdown", + "id": "8c6fbe08", + "metadata": {}, + "source": [ + "lazyjit and the original jit both support parallel compilation\n", + "\n", + "To avoid wasting memory with torch.tensor placeholders, use T.Tensor to create placeholders" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "7222e57b", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "c6d7f05cdfff412e9a527332438f7aa2", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Elaborating: 0%| | 0/8 [00:00,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ]" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from itertools import product\n", + "\n", + "\n", + "def get_configs():\n", + " return [\n", + " {\n", + " \"A\": T.Tensor((1024, 1024), T.float32),\n", + " \"B\": T.Tensor((1024, 1024), T.float32),\n", + " \"block_M\": block_M,\n", + " \"block_N\": block_N,\n", + " \"block_K\": block_K,\n", + " }\n", + " for block_M, block_N, block_K in product([32, 64], repeat=3)\n", + " ]\n", + "\n", + "\n", + "gemm.par_compile(get_configs())" + ] + }, + { + "cell_type": "markdown", + "id": "5160d2cc", + "metadata": {}, + "source": [ + "## More convenient macros" + ] + }, + { + "cell_type": "markdown", + "id": "be44afc4", + "metadata": {}, + "source": [ + "tilelang macros are now upgraded:\n", + "\n", + "1. Allow `T.Ref` as an annotation, similar to C++ pass-by-reference\n", + "2. Allow returning multiple values\n", + "3. 
Allow nesting and recursion" + ] + }, + { + "cell_type": "markdown", + "id": "79575972", + "metadata": {}, + "source": [ + "### Passing references with T.Ref\n", + "\n", + "The reference via T.Ref can target a var or a buffer element" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "90eaa6e5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "# import tilelang.language as T\n", + "\n", + "@T.prim_func\n", + "def foo(x_handle: T.handle):\n", + " x = T.match_buffer(x_handle, (2,), strides=(1,))\n", + " # with T.block(\"root\"):\n", + " bx = T.launch_thread(\"blockIdx.x\", 1)\n", + " tx = T.launch_thread(\"threadIdx.x\", 128)\n", + " ty = T.launch_thread(\"threadIdx.y\", 1)\n", + " tz = T.launch_thread(\"threadIdx.z\", 1)\n", + " with T.block(\"tilelang_root\"):\n", + " T.reads()\n", + " idx = T.Buffer((1,), \"int32\", scope=\"local.var\")\n", + " T.writes(x[T.min(1, idx[0]):T.min(1, idx[0]) + (T.max(1, idx[0]) + 1 - T.min(1, idx[0]))])\n", + " T.block_attr({\"tl.local_var_init\": {idx.data: 0}})\n", + " idx = T.alloc_buffer((1,), \"int32\", data=idx.data, scope=\"local.var\")\n", + " x[1] = T.float32(1.0)\n", + " _tmp: T.int32 = idx[0]\n", + " x[_tmp] = T.float32(1.0)" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "@T.macro\n", + "def macro_with_ref(x: T.Ref):\n", + " x = 1 # noqa: F841\n", + "\n", + "\n", + "@T.prim_func\n", + "def foo(x: T.Tensor((2,))):\n", + " with T.Kernel(1) as _:\n", + " # Supports constant indices\n", + " macro_with_ref(x[1])\n", + "\n", + " # Also supports variable indices\n", + " idx = T.alloc_var(T.int32, 0)\n", + " macro_with_ref(x[idx])\n", + "\n", + "\n", + "foo" + ] + }, + { + "cell_type": "markdown", + "id": "7bb447a2", + "metadata": {}, + "source": [ + "### Pass as arguments\n", + "\n", + "You can pass macros as parameters" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "dc7bb779", + "metadata": {}, + "outputs": [], + "source": [ + "@tilelang.lazy_jit\n", + "def element_wise(\n", + " A: T.Tensor[[T.dyn], Any],\n", + " fn,\n", + "):\n", + " (N,) = A.shape\n", + " B = T.empty((N,), dtype=A.dtype)\n", + " block_N = 128\n", + " with T.Kernel(T.ceildiv(N, block_N), threads=128) as bx:\n", + " for i in T.Parallel(block_N):\n", + " idx = bx * block_N + i\n", + " B[idx] = fn(A[idx])\n", + " return B\n", + "\n", + "\n", + "@T.macro\n", + "def add_one(x):\n", + " return x + 1" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "a89fdb44", + "metadata": {}, + "outputs": [], + "source": [ + "A = torch.randn(1024, device=\"cuda\")\n", + "B = element_wise(A, add_one)\n", + "B_ref = A + 1\n", + "torch.testing.assert_close(B, B_ref)" + ] + }, + { + "cell_type": "markdown", + "id": "ef6e403a", + "metadata": {}, + "source": [ + "### Macro recursion\n", + "\n", + "Macro can be recursive, even if it's rarely needed, as long as the termination condition is known at compile time" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "7703cab5", + "metadata": {}, + "outputs": [], + "source": [ + "@T.macro\n", + "def n31(x, var: T.Ref):\n", + " if x == 1:\n", + " pass\n", + " elif x % 2 == 0:\n", + " var = var // 2\n", + " n31(x // 2, var)\n", + " else:\n", + " var = var * 3 + 1\n", + " n31(x * 3 + 1, var)\n", + "\n", + "\n", + "@tilelang.lazy_jit\n", + "def foo(A: T.Tensor[[1], T.int32], n: int):\n", + " with T.Kernel(1) as _:\n", + " n31(n, A[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": 
"542ddd4e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([18], device='cuda:0', dtype=torch.int32)" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "A = torch.tensor([100], dtype=torch.int32, device=\"cuda\")\n", + "foo(A, 5)\n", + "A" + ] + }, + { + "cell_type": "markdown", + "id": "dc30c2d2", + "metadata": {}, + "source": [ + "### Macro returning multiple values" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d5a2388f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "# import tilelang.language as T\n", + "\n", + "@T.prim_func\n", + "def foo():\n", + " # with T.block(\"root\"):\n", + " x = T.launch_thread(\"blockIdx.x\", 32)\n", + " tx = T.launch_thread(\"threadIdx.x\", 128)\n", + " ty = T.launch_thread(\"threadIdx.y\", 1)\n", + " tz = T.launch_thread(\"threadIdx.z\", 1)\n", + " with T.block(\"tilelang_root\"):\n", + " T.reads()\n", + " T.writes()\n", + " s: T.int32 = T.sin(x)\n", + " c: T.int32 = T.cos(x)\n", + " a: T.int32 = s + c\n", + " b: T.int32 = s - c\n", + " T.evaluate(0)" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "@T.macro\n", + "def sincos(x):\n", + " return T.sin(x), T.cos(x)\n", + "\n", + "\n", + "@T.prim_func\n", + "def foo():\n", + " with T.Kernel(32) as x:\n", + " s, c = sincos(x)\n", + " a = s + c # noqa: F841\n", + " b = s - c # noqa: F841\n", + "\n", + "\n", + "foo" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "tilelang-dev_0", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file diff --git a/examples/lazy_jit/lazyjit.zh.ipynb b/examples/lazy_jit/lazyjit.zh.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..601c5c5d2fe3610adfed8edd890d6a701f5c49f8 --- /dev/null +++ b/examples/lazy_jit/lazyjit.zh.ipynb @@ -0,0 +1,789 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "5e0deecc", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "from pathlib import Path\n", + "\n", + "sys.path.insert(0, str(Path.cwd().parent.parent.absolute()))\n", + "import tilelang\n", + "import torch\n", + "import tilelang.language as T" + ] + }, + { + "cell_type": "markdown", + "id": "1ca2c56d", + "metadata": {}, + "source": [ + "# Tilelang Lazy JIT" + ] + }, + { + "cell_type": "markdown", + "id": "156e7370", + "metadata": {}, + "source": [ + "## Tensor Annotation" + ] + }, + { + "cell_type": "markdown", + "id": "b070c109", + "metadata": {}, + "source": [ + "Tilelang Lazy JIT 将 jit 生成和调用的逻辑合并到一起\n", + "\n", + "函数签名的写法与 triton 相似,但做了大量增强,最主要的增强是允许对 Tensor 的标注:\n", + "\n", + "例如,下面的代码用 T.Tensor[[int, int], T.float16] 来标注了一个二维 Tensor\n", + "1. 它的每个维度都是编译期常量,如果改变,会触发重新编译\n", + "2. 
它的类型必须是 T.float16\n", + "\n", + "DType 除了写确定的外,还可以写 Any 或者 None" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "60bf8954", + "metadata": {}, + "outputs": [], + "source": [ + "@tilelang.lazy_jit\n", + "def gemm(\n", + " A: T.Tensor[[int, int], T.float16],\n", + " B: T.Tensor[[int, int], T.float16],\n", + " out_dtype: T.dtype = T.float32,\n", + " block_M: int = 128,\n", + " block_N: int = 128,\n", + " block_K: int = 32,\n", + "):\n", + " M, K = A.shape\n", + " K, N = B.shape\n", + " C = T.empty((M, N), out_dtype)\n", + " with T.Kernel(T.ceildiv(M, block_M), T.ceildiv(N, block_N), threads=128) as (bx, by):\n", + " A_shared = T.alloc_shared((block_M, block_K), A.dtype)\n", + " B_shared = T.alloc_shared((block_K, block_N), B.dtype)\n", + " C_local = T.alloc_fragment((block_M, block_N), out_dtype)\n", + " T.clear(C_local)\n", + " for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=3):\n", + " T.copy(A[bx * block_M, k * block_K], A_shared)\n", + " T.copy(B[k * block_K, by * block_N], B_shared)\n", + " T.gemm(A_shared, B_shared, C_local)\n", + " T.copy(C_local, C[bx * block_M, by * block_N])\n", + " return C" + ] + }, + { + "cell_type": "markdown", + "id": "28f868fe", + "metadata": {}, + "source": [ + "直接将 Tensor 作为参数调用,即可触发完整的 jit 编译运行流程:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "ee13394a", + "metadata": {}, + "outputs": [], + "source": [ + "A = torch.randn(1024, 512, dtype=torch.float16, device=\"cuda\")\n", + "B = torch.randn(512, 256, dtype=torch.float16, device=\"cuda\")\n", + "C = gemm(A, B)\n", + "\n", + "# check output is correct\n", + "C_ref = (A @ B).float()\n", + "torch.testing.assert_close(C, C_ref, rtol=1e-2, atol=1e-2)" + ] + }, + { + "cell_type": "markdown", + "id": "c6705091", + "metadata": {}, + "source": [ + "更改调用的参数,如果编译器参数发生了变化,会触发重新编译:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "d8aab5b7", + "metadata": {}, + "outputs": [], + "source": [ + "A = torch.randn(1024, 512, dtype=torch.float16, device=\"cuda\")\n", + "B = torch.randn(512, 1024, dtype=torch.float16, device=\"cuda\")\n", + "C = gemm(A, B, block_M=64, block_N=64)" + ] + }, + { + "cell_type": "markdown", + "id": "ce6b7391", + "metadata": {}, + "source": [ + "你也可以手动调用 compile 函数编译 kernel\n", + "\n", + "1. `ker.compile` 编译 kernel\n", + "2. `ker.get_tir` 获取 tir\n", + "3. `ker.par_compile` 并行编译" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "f3cf3a2d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2025-11-25 17:29:46 [TileLang:tilelang.cache.kernel_cache:WARNING]: Found kernel in memory cache. 
For better performance, consider using `@tilelang.jit` instead of direct kernel caching.\n" + ] + } + ], + "source": [ + "kernel = gemm.compile(A, B, block_M=64, block_N=64)\n", + "C = kernel(A, B)" + ] + }, + { + "cell_type": "markdown", + "id": "921761b5", + "metadata": {}, + "source": [ + "## More Tensor Annotation" + ] + }, + { + "cell_type": "markdown", + "id": "4539e54e", + "metadata": {}, + "source": [ + "### 用 macro 来分离实现" + ] + }, + { + "cell_type": "markdown", + "id": "ad96ba65", + "metadata": {}, + "source": [ + "接下来,我们会用各种方式来实现一个简单的 gemm,为了方便,我们先写一个 macro 把 gemm 的主要逻辑写出来:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "171d4fe6", + "metadata": {}, + "outputs": [], + "source": [ + "@T.macro\n", + "def gemm_impl(A, B, C, M, N, K, block_M, block_N, block_K):\n", + " with T.Kernel(T.ceildiv(M, block_M), T.ceildiv(N, block_N), threads=128) as (bx, by):\n", + " A_shared = T.alloc_shared((block_M, block_K), A.dtype)\n", + " B_shared = T.alloc_shared((block_K, block_N), B.dtype)\n", + " C_local = T.alloc_fragment((block_M, block_N), C.dtype)\n", + " T.clear(C_local)\n", + " for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=3):\n", + " T.copy(A[bx * block_M, k * block_K], A_shared)\n", + " T.copy(B[k * block_K, by * block_N], B_shared)\n", + " T.gemm(A_shared, B_shared, C_local)\n", + " T.copy(C_local, C[bx * block_M, by * block_N])" + ] + }, + { + "cell_type": "markdown", + "id": "446a1acd", + "metadata": {}, + "source": [ + "### 用 T.dyn 标记动态 Shape\n", + "\n", + "当某些维度是动态的的时候,可以用 T.dyn 来标记。T.dyn 可以接受一个字符串参数,表示变量的名字" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6a38aa95", + "metadata": {}, + "outputs": [], + "source": [ + "@tilelang.lazy_jit\n", + "def gemm_dyn_K(\n", + " A: T.Tensor[[int, T.dyn[\"K\"]], T.float16], # noqa: F821\n", + " B: T.Tensor[[T.dyn[\"K\"], int], T.float16], # noqa: F821\n", + "):\n", + " M, K = A.shape\n", + " K, N = B.shape\n", + " C = T.empty((M, N), T.float32)\n", + " gemm_impl(A, B, C, M, N, K, 128, 128, 32)\n", + " return C" + ] + }, + { + "cell_type": "markdown", + "id": "c60fd346", + "metadata": {}, + "source": [ + "查看 lazy_jit 的函数签名,其中带有后缀`$` 的是不确定的编译期常量,带有 `$dyn` 的是运行时的变量" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "c6992eb4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'A': TensorAnnot(shape=[A_shape_0$, K$dyn], strides=None, dtype=dtype('float16')),\n", + " 'B': TensorAnnot(shape=[K$dyn, B_shape_1$], strides=None, dtype=dtype('float16'))}" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gemm_dyn_K.func.annot" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "fe6cfdc8", + "metadata": {}, + "outputs": [], + "source": [ + "A = torch.randn(1024, 512, dtype=torch.float16, device=\"cuda\")\n", + "B = torch.randn(512, 256, dtype=torch.float16, device=\"cuda\")\n", + "C = gemm_dyn_K(A, B)\n", + "C_ref = (A @ B).float()\n", + "torch.testing.assert_close(C, C_ref, rtol=1e-2, atol=1e-2)" + ] + }, + { + "cell_type": "markdown", + "id": "2ee97bf7", + "metadata": {}, + "source": [ + "### 用 T.StridedTensor 标记带 stride 的 Tensor\n", + "\n", + "标记方法:T.StridedTensor[Shape, Stride, DType],每个 Shape 或 Stride 可以写\n", + "* int: 表示编译期常量\n", + "* T.dyn:表示运行时常量\n", + "\n", + "DType 可以写 None 或 Any" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "9dde1dae", + "metadata": {}, + "outputs": [], + "source": [ + "from typing import Any\n", + "\n", + "\n", + "@tilelang.lazy_jit\n", 
+ "def as_contingious(A: T.StridedTensor[[T.dyn, T.dyn], [T.dyn, T.dyn], Any]):\n", + " M, N = A.shape\n", + " B = T.empty((M, N), A.dtype)\n", + " block_M = 128\n", + " block_N = 128\n", + " with T.Kernel(T.ceildiv(M, block_M), T.ceildiv(N, block_N), threads=128) as (bx, by):\n", + " T.copy(\n", + " A[bx * block_M : (bx + 1) * block_M, by * block_N : (by + 1) * block_N],\n", + " B[bx * block_M : (bx + 1) * block_M, by * block_N : (by + 1) * block_N],\n", + " )\n", + " return B" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "dec2c0a7", + "metadata": {}, + "outputs": [], + "source": [ + "A = torch.randn(1024, 1024, device=\"cuda\")\n", + "B = as_contingious(A[::2, ::2])\n", + "B_ref = A[::2, ::2].contiguous()\n", + "torch.testing.assert_close(B, B_ref)" + ] + }, + { + "cell_type": "markdown", + "id": "f5fb20d6", + "metadata": {}, + "source": [ + "## More Annotation" + ] + }, + { + "cell_type": "markdown", + "id": "890df0a2", + "metadata": {}, + "source": [ + "### 用 T.ptr 标注 Tensor\n", + "lazy_jit 允许你用 T.ptr 来声明一个 handle,但必须在函数内用 T.match_buffer 给它定义 shape" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "0fc17af6", + "metadata": {}, + "outputs": [], + "source": [ + "@tilelang.lazy_jit\n", + "def gemm_ptr(\n", + " A: T.ptr,\n", + " B: T.ptr,\n", + " M: int,\n", + " N: int,\n", + " K: int,\n", + "):\n", + " A = T.match_buffer(A, (M, K), T.float16)\n", + " B = T.match_buffer(B, (K, N), T.float16)\n", + " C = T.empty((M, N), T.float32)\n", + " gemm_impl(A, B, C, M, N, K, block_M=128, block_N=128, block_K=32)\n", + " return C" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "8e52a554", + "metadata": {}, + "outputs": [], + "source": [ + "A = torch.randn(1024, 512, dtype=torch.float16, device=\"cuda\")\n", + "B = torch.randn(512, 256, dtype=torch.float16, device=\"cuda\")\n", + "C = gemm_ptr(A, B, 1024, 256, 512)\n", + "C_ref = (A @ B).float()\n", + "torch.testing.assert_close(C, C_ref, rtol=1e-2, atol=1e-2)" + ] + }, + { + "cell_type": "markdown", + "id": "6b19ef90", + "metadata": {}, + "source": [ + "### 用 T.int32 标注运行时变量\n", + "\n", + "lazy_jit 允许你用 T.int32 或其他类型来定义运行时变量,这样,你可以写一个完全动态的 gemm,这和 triton 非常相似" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "c1e7598a", + "metadata": {}, + "outputs": [], + "source": [ + "@tilelang.lazy_jit\n", + "def gemm_ptr_dyn(\n", + " A: T.ptr,\n", + " B: T.ptr,\n", + " M: T.int32,\n", + " N: T.int32,\n", + " K: T.int32,\n", + "):\n", + " A = T.match_buffer(A, (M, K), T.float16, strides=(K, 1))\n", + " B = T.match_buffer(B, (K, N), T.float16, strides=(N, 1))\n", + " C = T.empty((M, N), T.float32)\n", + " gemm_impl(A, B, C, M, N, K, block_M=128, block_N=128, block_K=32)\n", + " return C" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "9e9a4c88", + "metadata": {}, + "outputs": [], + "source": [ + "A = torch.randn(1024, 512, dtype=torch.float16, device=\"cuda\")\n", + "B = torch.randn(512, 256, dtype=torch.float16, device=\"cuda\")\n", + "C = gemm_ptr_dyn(A, B, 1024, 256, 512)\n", + "C_ref = (A @ B).float()\n", + "torch.testing.assert_close(C, C_ref, rtol=1e-2, atol=1e-2)" + ] + }, + { + "cell_type": "markdown", + "id": "39166cb4", + "metadata": {}, + "source": [ + "## 编译与并行编译" + ] + }, + { + "cell_type": "markdown", + "id": "8c6fbe08", + "metadata": {}, + "source": [ + "lazyjit 和原来的 jit 都支持并行编译\n", + "\n", + "为了防止 torch.tensor 白白浪费内存,可以使用 T.Tensor 来创建 placeholder" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "7222e57b", + "metadata": 
{}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "c6d7f05cdfff412e9a527332438f7aa2", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Elaborating: 0%| | 0/8 [00:00,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ]" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from itertools import product\n", + "\n", + "\n", + "def get_configs():\n", + " return [\n", + " {\n", + " \"A\": T.Tensor((1024, 1024), T.float32),\n", + " \"B\": T.Tensor((1024, 1024), T.float32),\n", + " \"block_M\": block_M,\n", + " \"block_N\": block_N,\n", + " \"block_K\": block_K,\n", + " }\n", + " for block_M, block_N, block_K in product([32, 64], repeat=3)\n", + " ]\n", + "\n", + "\n", + "gemm.par_compile(get_configs())" + ] + }, + { + "cell_type": "markdown", + "id": "5160d2cc", + "metadata": {}, + "source": [ + "## More Convenient Macros" + ] + }, + { + "cell_type": "markdown", + "id": "be44afc4", + "metadata": {}, + "source": [ + "tilelang macros have been upgraded:\n", + "\n", + "1. `T.Ref` can be used as an annotation, similar to pass-by-reference in C++\n", + "2. Multiple return values are allowed\n", + "3. Nesting and recursion are allowed" + ] + }, + { + "cell_type": "markdown", + "id": "79575972", + "metadata": {}, + "source": [ + "### Passing references with T.Ref\n", + "\n", + "A reference passed via T.Ref can be a var or an indexed Buffer element" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "90eaa6e5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "# import tilelang.language as T\n", + "\n", + "@T.prim_func\n", + "def foo(x_handle: T.handle):\n", + " x = T.match_buffer(x_handle, (2,), strides=(1,))\n", + " # with T.block(\"root\"):\n", + " bx = T.launch_thread(\"blockIdx.x\", 1)\n", + " tx = T.launch_thread(\"threadIdx.x\", 128)\n", + " ty = T.launch_thread(\"threadIdx.y\", 1)\n", + " tz = T.launch_thread(\"threadIdx.z\", 1)\n", + " with T.block(\"tilelang_root\"):\n", + " T.reads()\n", + " idx = T.Buffer((1,), \"int32\", scope=\"local.var\")\n", + " T.writes(x[T.min(1, idx[0]):T.min(1, idx[0]) + (T.max(1, idx[0]) + 1 - T.min(1, idx[0]))])\n", + " T.block_attr({\"tl.local_var_init\": {idx.data: 0}})\n", + " idx = T.alloc_buffer((1,), \"int32\", data=idx.data, scope=\"local.var\")\n", + " x[1] = T.float32(1.0)\n", + " _tmp: T.int32 = idx[0]\n", + " x[_tmp] = T.float32(1.0)" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "@T.macro\n", + "def macro_with_ref(x: T.Ref):\n", + " x = 1 # noqa: F841\n", + "\n", + "\n", + "@T.prim_func\n", + "def foo(x: T.Tensor((2,))):\n", + " with T.Kernel(1) as _:\n", + " # a constant index is supported\n", + " macro_with_ref(x[1])\n", + "\n", + " # a variable index is supported as well\n", + " idx = T.alloc_var(T.int32, 0)\n", + " macro_with_ref(x[idx])\n", + "\n", + "\n", + "foo" + ] + }, + { + "cell_type": "markdown", + "id": "7bb447a2", + "metadata": {}, + "source": [ + "### Passing macros as arguments\n", + "\n", + "You can pass a macro around as an argument" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "dc7bb779", + "metadata": {}, + "outputs": [], + "source": [ + "@tilelang.lazy_jit\n", + "def element_wise(\n", + " A: T.Tensor[[T.dyn], Any],\n", + " fn,\n", + "):\n", + " (N,) = A.shape\n", + " B = T.empty((N,), dtype=A.dtype)\n", + " block_N = 128\n", + " with T.Kernel(T.ceildiv(N, block_N), threads=128) as bx:\n", + " for i in T.Parallel(block_N):\n", + " idx = bx * block_N + i\n", + " B[idx] = fn(A[idx])\n", + " return B\n", + "\n", + "\n", + "@T.macro\n", + "def add_one(x):\n", + " return x + 1" + ] + }, + { + "cell_type": "code", +
"execution_count": 19, + "id": "a89fdb44", + "metadata": {}, + "outputs": [], + "source": [ + "A = torch.randn(1024, device=\"cuda\")\n", + "B = element_wise(A, add_one)\n", + "B_ref = A + 1\n", + "torch.testing.assert_close(B, B_ref)" + ] + }, + { + "cell_type": "markdown", + "id": "ef6e403a", + "metadata": {}, + "source": [ + "### Macro recursion\n", + "\n", + "Not sure whether anyone actually needs this, but macros can recurse, as long as the termination condition is determined at compile time" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "7703cab5", + "metadata": {}, + "outputs": [], + "source": [ + "@T.macro\n", + "def n31(x, var: T.Ref):\n", + " if x == 1:\n", + " pass\n", + " elif x % 2 == 0:\n", + " var = var // 2\n", + " n31(x // 2, var)\n", + " else:\n", + " var = var * 3 + 1\n", + " n31(x * 3 + 1, var)\n", + "\n", + "\n", + "@tilelang.lazy_jit\n", + "def foo(A: T.Tensor[[1], T.int32], n: int):\n", + " with T.Kernel(1) as _:\n", + " n31(n, A[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "542ddd4e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([18], device='cuda:0', dtype=torch.int32)" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "A = torch.tensor([100], dtype=torch.int32, device=\"cuda\")\n", + "foo(A, 5)\n", + "A" + ] + }, + { + "cell_type": "markdown", + "id": "dc30c2d2", + "metadata": {}, + "source": [ + "### Macros returning multiple values" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d5a2388f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "# import tilelang.language as T\n", + "\n", + "@T.prim_func\n", + "def foo():\n", + " # with T.block(\"root\"):\n", + " x = T.launch_thread(\"blockIdx.x\", 32)\n", + " tx = T.launch_thread(\"threadIdx.x\", 128)\n", + " ty = T.launch_thread(\"threadIdx.y\", 1)\n", + " tz = T.launch_thread(\"threadIdx.z\", 1)\n", + " with T.block(\"tilelang_root\"):\n", + " T.reads()\n", + " T.writes()\n", + " s: T.int32 = T.sin(x)\n", + " c: T.int32 = T.cos(x)\n", + " a: T.int32 = s + c\n", + " b: T.int32 = s - c\n", + " T.evaluate(0)" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "@T.macro\n", + "def sincos(x):\n", + " return T.sin(x), T.cos(x)\n", + "\n", + "\n", + "@T.prim_func\n", + "def foo():\n", + " with T.Kernel(32) as x:\n", + " s, c = sincos(x)\n", + " a = s + c # noqa: F841\n", + " b = s - c # noqa: F841\n", + "\n", + "\n", + "foo" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "tilelang-dev_0", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file diff --git a/examples/linear_attention/example_linear_attn_bwd.py b/examples/linear_attention/example_linear_attn_bwd.py index 568bcc55f0cb9cad82e3ddf88f93784294185d8e..397ec7bdf6fe6d27a47e3d29c82d1b331ebb277d 100644 --- a/examples/linear_attention/example_linear_attn_bwd.py +++ b/examples/linear_attention/example_linear_attn_bwd.py @@ -13,20 +13,20 @@ from typing import Optional, Tuple pass_configs={ tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, - }) + } +) def tl_fused_chunk_bwd_kernel( B, S, H, DK, DV, - dtype: str = 'float16', + dtype: T.dtype = T.float16,
scale: float = None, ) -> torch.Tensor: - if scale is None: scale = DK**-0.5 - accum_dtype = 'float' + accum_dtype = T.float32 chunk_size = 64 BK = BV = 64 # Set to 128 can be faster, but has some numerical differences with FLA @@ -37,13 +37,13 @@ def tl_fused_chunk_bwd_kernel( @T.prim_func def fused_chunk_linear_attn_bwd( - Q: T.Tensor([B, S, H, DK], dtype), # type: ignore - K: T.Tensor([B, S, H, DK], dtype), # type: ignore - V: T.Tensor([B, S, H, DV], dtype), # type: ignore - dO: T.Tensor([B, S, H, DV], dtype), # type: ignore - dQ: T.Tensor([B, S, H, DK], accum_dtype), # type: ignore - dK: T.Tensor([B, S, H, DK], accum_dtype), # type: ignore - dV: T.Tensor([B, S, H, DV], accum_dtype), # type: ignore + Q: T.Tensor([B, S, H, DK], dtype), # type: ignore + K: T.Tensor([B, S, H, DK], dtype), # type: ignore + V: T.Tensor([B, S, H, DV], dtype), # type: ignore + dO: T.Tensor([B, S, H, DV], dtype), # type: ignore + dQ: T.Tensor([B, S, H, DK], accum_dtype), # type: ignore + dK: T.Tensor([B, S, H, DK], accum_dtype), # type: ignore + dV: T.Tensor([B, S, H, DV], accum_dtype), # type: ignore ): with T.Kernel(NV, NK, B * H) as (i_v, i_k, i_bh): i_b = i_bh // H @@ -66,11 +66,13 @@ def tl_fused_chunk_bwd_kernel( dh = T.alloc_fragment([BK, BV], accum_dtype) dh_shared = T.alloc_shared([BK, BV], dtype) - T.annotate_layout({ - dq_shared: tilelang.layout.make_swizzled_layout(dq_shared), - dk_shared: tilelang.layout.make_swizzled_layout(dk_shared), - dv_shared: tilelang.layout.make_swizzled_layout(dv_shared) - }) + T.annotate_layout( + { + dq_shared: tilelang.layout.make_swizzled_layout(dq_shared), + dk_shared: tilelang.layout.make_swizzled_layout(dk_shared), + dv_shared: tilelang.layout.make_swizzled_layout(dv_shared), + } + ) T.use_swizzle(10) T.clear(h) @@ -78,10 +80,9 @@ def tl_fused_chunk_bwd_kernel( # Calculate dQ for i in T.Pipelined(0, NT): - T.copy(K[i_b, i * chunk_size:(i + 1) * chunk_size, i_h, i_k * BK:(i_k + 1) * BK], k) - T.copy(V[i_b, i * chunk_size:(i + 1) * chunk_size, i_h, i_v * BV:(i_v + 1) * BV], v) - T.copy(dO[i_b, i * chunk_size:(i + 1) * chunk_size, i_h, i_v * BV:(i_v + 1) * BV], - do) + T.copy(K[i_b, i * chunk_size : (i + 1) * chunk_size, i_h, i_k * BK : (i_k + 1) * BK], k) + T.copy(V[i_b, i * chunk_size : (i + 1) * chunk_size, i_h, i_v * BV : (i_v + 1) * BV], v) + T.copy(dO[i_b, i * chunk_size : (i + 1) * chunk_size, i_h, i_v * BV : (i_v + 1) * BV], do) T.gemm(do, v, ds, transpose_B=True, clear_accum=True) for row, col in T.Parallel(chunk_size, chunk_size): @@ -94,29 +95,19 @@ def tl_fused_chunk_bwd_kernel( for row, col in T.Parallel(chunk_size, BK): dq[row, col] *= scale T.copy(dq, dq_shared) - T.atomic_add( - dQ[i_b, i * chunk_size:(i + 1) * chunk_size, i_h, i_k * BK:(i_k + 1) * BK], - dq_shared) + T.atomic_add(dQ[i_b, i * chunk_size : (i + 1) * chunk_size, i_h, i_k * BK : (i_k + 1) * BK], dq_shared) # Calculate dK, dV (reversely) for i in T.Pipelined(1, NT + 1): start = NT - i for row, col in T.Parallel(chunk_size, BK): q[row, col] = Q[i_b, start * chunk_size + row, i_h, i_k * BK + col] * scale - T.copy( - K[i_b, start * chunk_size:(start + 1) * chunk_size, i_h, - i_k * BK:(i_k + 1) * BK], k) - T.copy( - V[i_b, start * chunk_size:(start + 1) * chunk_size, i_h, - i_v * BV:(i_v + 1) * BV], v) - T.copy( - dO[i_b, start * chunk_size:(start + 1) * chunk_size, i_h, - i_v * BV:(i_v + 1) * BV], do) + T.copy(K[i_b, start * chunk_size : (start + 1) * chunk_size, i_h, i_k * BK : (i_k + 1) * BK], k) + T.copy(V[i_b, start * chunk_size : (start + 1) * chunk_size, i_h, i_v * BV : (i_v + 1) * BV], 
v) + T.copy(dO[i_b, start * chunk_size : (start + 1) * chunk_size, i_h, i_v * BV : (i_v + 1) * BV], do) # Calculate dk - T.gemm( - v, do, ds, transpose_B=True, clear_accum=True - ) # ds here actually means `s`, but we simply reuse the buffer `ds` + T.gemm(v, do, ds, transpose_B=True, clear_accum=True) # ds here actually means `s`, but we simply reuse the buffer `ds` for row, col in T.Parallel(chunk_size, chunk_size): ds_shared[row, col] = T.if_then_else(row <= col, ds[row, col], 0) T.gemm(ds_shared, q, dk, clear_accum=True) @@ -134,13 +125,9 @@ def tl_fused_chunk_bwd_kernel( T.gemm(q, do, dh, transpose_A=True) T.copy(dk, dk_shared) - T.atomic_add( - dK[i_b, start * chunk_size:(start + 1) * chunk_size, i_h, - i_k * BK:(i_k + 1) * BK], dk_shared) + T.atomic_add(dK[i_b, start * chunk_size : (start + 1) * chunk_size, i_h, i_k * BK : (i_k + 1) * BK], dk_shared) T.copy(dv, dv_shared) - T.atomic_add( - dV[i_b, start * chunk_size:(start + 1) * chunk_size, i_h, - i_v * BV:(i_v + 1) * BV], dv_shared) + T.atomic_add(dV[i_b, start * chunk_size : (start + 1) * chunk_size, i_h, i_v * BV : (i_v + 1) * BV], dv_shared) return fused_chunk_linear_attn_bwd @@ -155,34 +142,31 @@ def tl_fused_chunk_bwd(Q, K, V, dO): return dQ.to(torch.float16), dK.to(torch.float16), dV.to(torch.float16) -def ref_program(q: torch.Tensor, - k: torch.Tensor, - v: torch.Tensor, - scale: Optional[float] = None) -> Tuple[torch.Tensor, torch.Tensor]: +def ref_program(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, scale: Optional[float] = None) -> Tuple[torch.Tensor, torch.Tensor]: q, k, v = q.float(), k.float(), v.float() if scale is None: - scale = q.shape[-1]**-0.5 + scale = q.shape[-1] ** -0.5 chunk_size = 64 - q = rearrange(q, 'b (n c) h d -> b h n c d', c=chunk_size) * scale - k = rearrange(k, 'b (n c) h d -> b h n c d', c=chunk_size) - v = rearrange(v, 'b (n c) h d -> b h n c d', c=chunk_size) + q = rearrange(q, "b (n c) h d -> b h n c d", c=chunk_size) * scale + k = rearrange(k, "b (n c) h d -> b h n c d", c=chunk_size) + v = rearrange(v, "b (n c) h d -> b h n c d", c=chunk_size) kv = k.transpose(-1, -2) @ v kv = kv.cumsum(2) h = kv[:, :, -1, :, :] kv = torch.cat([torch.zeros_like(kv[:, :, :1]), kv[:, :, :-1]], dim=2) inter = q @ kv - intra = ((q @ k.transpose(-1, -2)).masked_fill_( - torch.triu(torch.ones(chunk_size, chunk_size, dtype=bool, device=q.device), diagonal=1), - 0)) @ v + intra = ( + (q @ k.transpose(-1, -2)).masked_fill_(torch.triu(torch.ones(chunk_size, chunk_size, dtype=bool, device=q.device), diagonal=1), 0) + ) @ v o = inter + intra - return rearrange(o, 'b h n c d -> b (n c) h d'), h + return rearrange(o, "b h n c d -> b (n c) h d"), h def main(B=1, S=1024, H=16, D=128): - q = torch.randn((B, S, H, D), device='cuda', dtype=torch.float16, requires_grad=True) - k = torch.randn((B, S, H, D), device='cuda', dtype=torch.float16, requires_grad=True) - v = torch.randn((B, S, H, D), device='cuda', dtype=torch.float16, requires_grad=True) - do = torch.randn((B, S, H, D), device='cuda', dtype=torch.float16) + q = torch.randn((B, S, H, D), device="cuda", dtype=torch.float16, requires_grad=True) + k = torch.randn((B, S, H, D), device="cuda", dtype=torch.float16, requires_grad=True) + v = torch.randn((B, S, H, D), device="cuda", dtype=torch.float16, requires_grad=True) + do = torch.randn((B, S, H, D), device="cuda", dtype=torch.float16) # qk norm is necessary for linear attn q = l2norm_fwd(q)[0].requires_grad_(True) @@ -193,30 +177,27 @@ def main(B=1, S=1024, H=16, D=128): o_ref, _ = ref_program(q, k, v) 
o_ref.backward(do, retain_graph=True) - assert torch.allclose( - dq, q.grad, atol=1e-2, rtol=1e-2), f'dq max err: {(dq - q.grad).abs().max()}' - assert torch.allclose( - dk, k.grad, atol=1e-2, rtol=1e-2), f'dk max err: {(dk - k.grad).abs().max()}' - assert torch.allclose( - dv, v.grad, atol=1e-2, rtol=1e-2), f'dv max err: {(dv - v.grad).abs().max()}' - print('Passed all tests!✅') + assert torch.allclose(dq, q.grad, atol=1e-2, rtol=1e-2), f"dq max err: {(dq - q.grad).abs().max()}" + assert torch.allclose(dk, k.grad, atol=1e-2, rtol=1e-2), f"dk max err: {(dk - k.grad).abs().max()}" + assert torch.allclose(dv, v.grad, atol=1e-2, rtol=1e-2), f"dv max err: {(dv - v.grad).abs().max()}" + print("Passed all tests!✅") # Benchmark q.grad = k.grad = v.grad = None o_ref, _ = fused_chunk_linear_attn(q, k, v, output_final_state=True, normalize=False) - t1 = do_bench(lambda: o_ref.backward(do, retain_graph=True), backend='cupti') - t2 = do_bench(lambda: tl_fused_chunk_bwd(q, k, v, do), backend='cupti') - print(f'Triton latency: {t1:.3f} ms') - print(f'TileLang latency: {t2:.3f} ms') - print(f'Speedup: {t1/t2:.3f}x') + t1 = do_bench(lambda: o_ref.backward(do, retain_graph=True), backend="cupti") + t2 = do_bench(lambda: tl_fused_chunk_bwd(q, k, v, do), backend="cupti") + print(f"Triton latency: {t1:.3f} ms") + print(f"TileLang latency: {t2:.3f} ms") + print(f"Speedup: {t1 / t2:.3f}x") -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--B', type=int, default=8, help='Batch size') - parser.add_argument('--S', type=int, default=1024, help='Seq len') - parser.add_argument('--H', type=int, default=32, help='Num heads') - parser.add_argument('--D', type=int, default=128, help='Head dim') + parser.add_argument("--B", type=int, default=8, help="Batch size") + parser.add_argument("--S", type=int, default=1024, help="Seq len") + parser.add_argument("--H", type=int, default=32, help="Num heads") + parser.add_argument("--D", type=int, default=128, help="Head dim") args = parser.parse_args() main(args.B, args.S, args.H, args.D) diff --git a/examples/linear_attention/example_linear_attn_fwd.py b/examples/linear_attention/example_linear_attn_fwd.py index 03900a7e649b0f2113a103147abc91aef3b1c6b1..849841e5179e3a3baccf1bb7794e425cd5fee990 100644 --- a/examples/linear_attention/example_linear_attn_fwd.py +++ b/examples/linear_attention/example_linear_attn_fwd.py @@ -14,20 +14,20 @@ from typing import Optional, Tuple pass_configs={ tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, - }) + }, +) def tl_fused_chunk_fwd_kernel( B, S, H, DK, DV, - dtype: str = 'float16', + dtype: T.dtype = T.float16, scale: float = None, ) -> torch.Tensor: - if scale is None: scale = DK**-0.5 - accum_dtype = 'float' + accum_dtype = T.float32 chunk_size = 64 BK = BV = 64 # Set to 128 can be faster, but has some numerical differences with FLA @@ -38,11 +38,12 @@ def tl_fused_chunk_fwd_kernel( @T.prim_func def fused_chunk_linear_attn_fwd( - Q: T.Tensor([B, S, H, DK], dtype), # type: ignore - K: T.Tensor([B, S, H, DK], dtype), # type: ignore - V: T.Tensor([B, S, H, DV], dtype), # type: ignore - O: T.Tensor([B, S, H, DV], accum_dtype), # type: ignore - final_state: T.Tensor([B, H, DK, DV], accum_dtype)): # type: ignore + Q: T.Tensor([B, S, H, DK], dtype), # type: ignore + K: T.Tensor([B, S, H, DK], dtype), # type: ignore + V: T.Tensor([B, S, H, DV], dtype), # type: ignore + O: T.Tensor([B, S, H, DV], accum_dtype), # type: ignore + 
final_state: T.Tensor([B, H, DK, DV], accum_dtype), + ): # type: ignore with T.Kernel(NV, NK, B * H) as (i_v, i_k, i_bh): i_b = i_bh // H i_h = i_bh % H @@ -65,8 +66,8 @@ def tl_fused_chunk_fwd_kernel( for i in T.Pipelined(0, NT): for row, col in T.Parallel(chunk_size, BK): q[row, col] = Q[i_b, i * chunk_size + row, i_h, i_k * BK + col] * scale - T.copy(K[i_b, i * chunk_size:(i + 1) * chunk_size, i_h, i_k * BK:(i_k + 1) * BK], k) - T.copy(V[i_b, i * chunk_size:(i + 1) * chunk_size, i_h, i_v * BV:(i_v + 1) * BV], v) + T.copy(K[i_b, i * chunk_size : (i + 1) * chunk_size, i_h, i_k * BK : (i_k + 1) * BK], k) + T.copy(V[i_b, i * chunk_size : (i + 1) * chunk_size, i_h, i_v * BV : (i_v + 1) * BV], v) T.gemm(q, k, s, clear_accum=True, transpose_B=True) for row, col in T.Parallel(chunk_size, chunk_size): @@ -77,12 +78,10 @@ def tl_fused_chunk_fwd_kernel( T.gemm(k, v, h, transpose_A=True) T.gemm(q, h_shared, o) T.copy(o, o_shared) - T.atomic_add( - O[i_b, i * chunk_size:(i + 1) * chunk_size, i_h, i_v * BV:(i_v + 1) * BV], - o_shared) + T.atomic_add(O[i_b, i * chunk_size : (i + 1) * chunk_size, i_h, i_v * BV : (i_v + 1) * BV], o_shared) # Output final state - T.copy(h, final_state[i_b, i_h, i_k * BK:(i_k + 1) * BK, i_v * BV:(i_v + 1) * BV]) + T.copy(h, final_state[i_b, i_h, i_k * BK : (i_k + 1) * BK, i_v * BV : (i_v + 1) * BV]) return fused_chunk_linear_attn_fwd @@ -91,38 +90,35 @@ def tl_fused_chunk_fwd(q, k, v): B, S, H, D = q.shape kernel = tl_fused_chunk_fwd_kernel(B, S, H, D, D) print(kernel.get_kernel_source()) - o = torch.zeros((B, S, H, D), device='cuda', dtype=torch.float32) + o = torch.zeros((B, S, H, D), device="cuda", dtype=torch.float32) h = kernel(q, k, v, o) return o, h -def ref_program(q: torch.Tensor, - k: torch.Tensor, - v: torch.Tensor, - scale: Optional[float] = None) -> Tuple[torch.Tensor, torch.Tensor]: +def ref_program(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, scale: Optional[float] = None) -> Tuple[torch.Tensor, torch.Tensor]: q, k, v = q.float(), k.float(), v.float() if scale is None: - scale = q.shape[-1]**-0.5 + scale = q.shape[-1] ** -0.5 chunk_size = 64 - q = rearrange(q, 'b (n c) h d -> b h n c d', c=chunk_size) * scale - k = rearrange(k, 'b (n c) h d -> b h n c d', c=chunk_size) - v = rearrange(v, 'b (n c) h d -> b h n c d', c=chunk_size) + q = rearrange(q, "b (n c) h d -> b h n c d", c=chunk_size) * scale + k = rearrange(k, "b (n c) h d -> b h n c d", c=chunk_size) + v = rearrange(v, "b (n c) h d -> b h n c d", c=chunk_size) kv = k.transpose(-1, -2) @ v kv = kv.cumsum(2) h = kv[:, :, -1, :, :] kv = torch.cat([torch.zeros_like(kv[:, :, :1]), kv[:, :, :-1]], dim=2) inter = q @ kv - intra = ((q @ k.transpose(-1, -2)).masked_fill_( - torch.triu(torch.ones(chunk_size, chunk_size, dtype=bool, device=q.device), diagonal=1), - 0)) @ v + intra = ( + (q @ k.transpose(-1, -2)).masked_fill_(torch.triu(torch.ones(chunk_size, chunk_size, dtype=bool, device=q.device), diagonal=1), 0) + ) @ v o = inter + intra - return rearrange(o, 'b h n c d -> b (n c) h d'), h + return rearrange(o, "b h n c d -> b (n c) h d"), h def main(B=1, S=512, H=16, D=128): - q = torch.randn((B, S, H, D), device='cuda', dtype=torch.float16) - k = torch.randn((B, S, H, D), device='cuda', dtype=torch.float16) - v = torch.randn((B, S, H, D), device='cuda', dtype=torch.float16) + q = torch.randn((B, S, H, D), device="cuda", dtype=torch.float16) + k = torch.randn((B, S, H, D), device="cuda", dtype=torch.float16) + v = torch.randn((B, S, H, D), device="cuda", dtype=torch.float16) # qk norm is necessary for 
linear attn q, _ = l2norm_fwd(q) @@ -131,25 +127,23 @@ def main(B=1, S=512, H=16, D=128): o, h = tl_fused_chunk_fwd(q, k, v) o_ref, h_ref = ref_program(q, k, v) - assert torch.allclose(o, o_ref, atol=1e-2, rtol=1e-2), f'o max err: {(o - o_ref).abs().max()}' - assert torch.allclose(h, h_ref, atol=1e-2, rtol=1e-2), f'h max err: {(h - h_ref).abs().max()}' - print('Passed all tests!✅') + assert torch.allclose(o, o_ref, atol=1e-2, rtol=1e-2), f"o max err: {(o - o_ref).abs().max()}" + assert torch.allclose(h, h_ref, atol=1e-2, rtol=1e-2), f"h max err: {(h - h_ref).abs().max()}" + print("Passed all tests!✅") - t1 = do_bench( - lambda: fused_chunk_linear_attn(q, k, v, output_final_state=True, normalize=False), - backend='cupti') - t2 = do_bench(lambda: tl_fused_chunk_fwd(q, k, v), backend='cupti') - print(f'Triton latency: {t1:.3f} ms') - print(f'TileLang latency: {t2:.3f} ms') - print(f'Speedup: {t1/t2:.3f}x') + t1 = do_bench(lambda: fused_chunk_linear_attn(q, k, v, output_final_state=True, normalize=False), backend="cupti") + t2 = do_bench(lambda: tl_fused_chunk_fwd(q, k, v), backend="cupti") + print(f"Triton latency: {t1:.3f} ms") + print(f"TileLang latency: {t2:.3f} ms") + print(f"Speedup: {t1 / t2:.3f}x") -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--B', type=int, default=8, help='Batch size') - parser.add_argument('--S', type=int, default=1024, help='Seq len') - parser.add_argument('--H', type=int, default=32, help='Num heads') - parser.add_argument('--D', type=int, default=128, help='Head dim') + parser.add_argument("--B", type=int, default=8, help="Batch size") + parser.add_argument("--S", type=int, default=1024, help="Seq len") + parser.add_argument("--H", type=int, default=32, help="Num heads") + parser.add_argument("--D", type=int, default=128, help="Head dim") args = parser.parse_args() main(args.B, args.S, args.H, args.D) diff --git a/examples/linear_attention/example_mamba_chunk_scan.py b/examples/linear_attention/example_mamba_chunk_scan.py index add49052db785a707ac4fd0ff27b2853b517eeda..1958dfb5aa95a64fd38e40f6632b787b39150c19 100644 --- a/examples/linear_attention/example_mamba_chunk_scan.py +++ b/examples/linear_attention/example_mamba_chunk_scan.py @@ -9,6 +9,7 @@ import itertools def chunk_scan_triton(cb, x, dt, dA_cumsum, C, states, D): from mamba_ssm.ops.triton.ssd_chunk_scan import _chunk_scan_fwd + out, _ = _chunk_scan_fwd(cb, x, dt, dA_cumsum, C, states, D) return out @@ -43,14 +44,15 @@ def ref_program(cb, x, dt, dA_cumsum, C, prev_states, D): dt_segment_sum = dA_cumsum[:, :, :, :, None] - dA_cumsum[:, :, :, None, :] decay = torch.exp(dt_segment_sum) scores_decay = cb * rearrange(decay, "b h c l s -> b c h l s") - causal_mask = torch.tril( - torch.ones(chunk_size, chunk_size, device=x.device, dtype=bool), diagonal=0) + causal_mask = torch.tril(torch.ones(chunk_size, chunk_size, device=x.device, dtype=bool), diagonal=0) scores_decay = scores_decay.masked_fill(~causal_mask, 0) - out = torch.einsum('bchls,bhcs,bcshp->bclhp', scores_decay.to(x.dtype), dt.to(x.dtype), - rearrange(x, "b (c s) h p -> b c s h p", c=nchunks)) + out = torch.einsum( + "bchls,bhcs,bcshp->bclhp", scores_decay.to(x.dtype), dt.to(x.dtype), rearrange(x, "b (c s) h p -> b c s h p", c=nchunks) + ) state_decay_out = torch.exp(rearrange(dA_cumsum, "b h c l -> b c l h 1")) - out_prev = torch.einsum('bclhn,bchpn->bclhp', rearrange( - C, "b (c l) h n -> b c l h n", c=nchunks), prev_states.to(C.dtype)) * state_decay_out + out_prev = ( + 
torch.einsum("bclhn,bchpn->bclhp", rearrange(C, "b (c l) h n -> b c l h n", c=nchunks), prev_states.to(C.dtype)) * state_decay_out + ) out = out + out_prev out = rearrange(out, "b c l h p -> b (c l) h p") if D is not None: @@ -61,12 +63,7 @@ def ref_program(cb, x, dt, dA_cumsum, C, prev_states, D): def get_configs(): - iter_params = dict( - block_M=[64, 128, 256], - block_N=[32, 64], - block_K=[64, 128, 256], - block_Dstate=[128], - num_stages=[1, 2, 3, 4, 5]) + iter_params = dict(block_M=[64, 128, 256], block_N=[32, 64], block_K=[64, 128, 256], block_Dstate=[128], num_stages=[1, 2, 3, 4, 5]) return [dict(zip(iter_params, values)) for values in itertools.product(*iter_params.values())] @@ -77,40 +74,42 @@ def get_configs(): tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, }, ) -def chunk_scan_fwd(batch, - seqlen, - chunk_size, - ngroups, - nheads, - headdim, - dstate, - block_M=64, - block_N=64, - block_K=64, - block_Dstate=128, - num_stages=2, - threads=128): - dtype = "float16" - accum_dtype = "float" +def chunk_scan_fwd( + batch, + seqlen, + chunk_size, + ngroups, + nheads, + headdim, + dstate, + block_M=64, + block_N=64, + block_K=64, + block_Dstate=128, + num_stages=2, + threads=128, +): + dtype = T.float16 + accum_dtype = T.float32 nchunks = T.ceildiv(seqlen, chunk_size) p = 1.44269504 @T.prim_func def main( - cb: T.Tensor((batch, nchunks, ngroups, chunk_size, chunk_size), dtype), # type: ignore - x: T.Tensor((batch, seqlen, nheads, headdim), dtype), # type: ignore - dt: T.Tensor((batch, nheads, nchunks, chunk_size), dtype), # type: ignore - dA_cumsum: T.Tensor((batch, nheads, nchunks, chunk_size), dtype), # type: ignore - C: T.Tensor((batch, seqlen, ngroups, dstate), dtype), # type: ignore - prev_states: T.Tensor((batch, nchunks, nheads, headdim, dstate), dtype), # type: ignore - D: T.Tensor((nheads), dtype), # type: ignore - Output: T.Tensor((batch, seqlen, nheads, headdim), dtype) # type: ignore + cb: T.Tensor((batch, nchunks, ngroups, chunk_size, chunk_size), dtype), # type: ignore + x: T.Tensor((batch, seqlen, nheads, headdim), dtype), # type: ignore + dt: T.Tensor((batch, nheads, nchunks, chunk_size), dtype), # type: ignore + dA_cumsum: T.Tensor((batch, nheads, nchunks, chunk_size), dtype), # type: ignore + C: T.Tensor((batch, seqlen, ngroups, dstate), dtype), # type: ignore + prev_states: T.Tensor((batch, nchunks, nheads, headdim, dstate), dtype), # type: ignore + D: T.Tensor((nheads), dtype), # type: ignore + Output: T.Tensor((batch, seqlen, nheads, headdim), dtype), # type: ignore ): - with T.Kernel( - nheads, - T.ceildiv(chunk_size, block_M) * T.ceildiv(headdim, block_N), - batch * nchunks, - threads=threads) as (bz, bx, by): + with T.Kernel(nheads, T.ceildiv(chunk_size, block_M) * T.ceildiv(headdim, block_N), batch * nchunks, threads=threads) as ( + bz, + bx, + by, + ): acc_o = T.alloc_fragment((block_M, block_N), accum_dtype) acc_o_shared = T.alloc_shared((block_M, block_N), dtype) cb_shared = T.alloc_shared((block_M, block_K), dtype, scope="shared.dyn") @@ -136,27 +135,32 @@ def chunk_scan_fwd(batch, m_idx = bx // T.ceildiv(headdim, block_N) n_idx = bx % T.ceildiv(headdim, block_N) - T.annotate_layout({ - acc_o_shared: tilelang.layout.make_swizzled_layout(acc_o_shared), - cb_shared: tilelang.layout.make_swizzled_layout(cb_shared), - x_residual_shared: tilelang.layout.make_swizzled_layout(x_residual_shared) - }) + T.annotate_layout( + { + acc_o_shared: tilelang.layout.make_swizzled_layout(acc_o_shared), + cb_shared: tilelang.layout.make_swizzled_layout(cb_shared), + 
x_residual_shared: tilelang.layout.make_swizzled_layout(x_residual_shared), + } + ) T.no_set_max_nreg() - T.copy(dA_cumsum[batch_idx, bz, chunk_idx, m_idx * block_M:(m_idx + 1) * block_M], - dA_cs_m_shared) + T.copy(dA_cumsum[batch_idx, bz, chunk_idx, m_idx * block_M : (m_idx + 1) * block_M], dA_cs_m_shared) T.copy(dA_cs_m_shared, dA_cs_m_local) T.clear(acc_o) for i in T.Parallel(block_M): scale_m_local[i] = T.exp2(dA_cs_m_local[i] * p) T.copy( - C[batch_idx, chunk_idx * chunk_size + m_idx * block_M:chunk_idx * chunk_size + - (m_idx + 1) * block_M, bz // (nheads // ngroups), 0:block_Dstate], C_shared) - T.copy( - prev_states[batch_idx, chunk_idx, bz, n_idx * block_N:(n_idx + 1) * block_N, - 0:block_Dstate], prev_state_shared) + C[ + batch_idx, + chunk_idx * chunk_size + m_idx * block_M : chunk_idx * chunk_size + (m_idx + 1) * block_M, + bz // (nheads // ngroups), + 0:block_Dstate, + ], + C_shared, + ) + T.copy(prev_states[batch_idx, chunk_idx, bz, n_idx * block_N : (n_idx + 1) * block_N, 0:block_Dstate], prev_state_shared) T.gemm(C_shared, prev_state_shared, acc_o, transpose_B=True) for i, j in T.Parallel(block_M, block_N): acc_o[i, j] *= scale_m_local[i] @@ -165,34 +169,47 @@ def chunk_scan_fwd(batch, for k in T.Pipelined(loop_range, num_stages=num_stages): T.copy( - cb[batch_idx, chunk_idx, bz // (nheads // ngroups), - m_idx * block_M:(m_idx + 1) * block_M, k * block_K:(k + 1) * block_K], - cb_shared) + cb[ + batch_idx, + chunk_idx, + bz // (nheads // ngroups), + m_idx * block_M : (m_idx + 1) * block_M, + k * block_K : (k + 1) * block_K, + ], + cb_shared, + ) T.copy(cb_shared, cb_local) - T.copy(dA_cumsum[batch_idx, bz, chunk_idx, k * block_K:(k + 1) * block_K], - dA_cs_k_shared) + T.copy(dA_cumsum[batch_idx, bz, chunk_idx, k * block_K : (k + 1) * block_K], dA_cs_k_shared) T.copy(dA_cs_k_shared, dA_cs_k_local) for i, j in T.Parallel(block_M, block_K): - cb_local[i, - j] = cb_local[i, - j] * T.exp2(dA_cs_m_local[i] * p - dA_cs_k_local[j] * p) - T.copy(dt[batch_idx, bz, chunk_idx, k * block_K:(k + 1) * block_K], dt_shared) + cb_local[i, j] = cb_local[i, j] * T.exp2(dA_cs_m_local[i] * p - dA_cs_k_local[j] * p) + T.copy(dt[batch_idx, bz, chunk_idx, k * block_K : (k + 1) * block_K], dt_shared) T.copy(dt_shared, dt_local) for i, j in T.Parallel(block_M, block_K): cb_local[i, j] *= dt_local[j] for i, j in T.Parallel(block_M, block_K): - cb_local[i, j] = T.if_then_else(m_idx * block_M + i >= k * block_K + j, - cb_local[i, j], 0) + cb_local[i, j] = T.if_then_else(m_idx * block_M + i >= k * block_K + j, cb_local[i, j], 0) T.copy( - x[batch_idx, chunk_idx * chunk_size + k * block_K:chunk_idx * chunk_size + - (k + 1) * block_K, bz, n_idx * block_N:(n_idx + 1) * block_N], x_shared) + x[ + batch_idx, + chunk_idx * chunk_size + k * block_K : chunk_idx * chunk_size + (k + 1) * block_K, + bz, + n_idx * block_N : (n_idx + 1) * block_N, + ], + x_shared, + ) T.gemm(cb_local, x_shared, acc_o) D_local[0] = D[bz] T.copy( - x[batch_idx, chunk_idx * chunk_size + m_idx * block_M:chunk_idx * chunk_size + - (m_idx + 1) * block_M, bz, n_idx * block_N:(n_idx + 1) * block_N], - x_residual_shared) + x[ + batch_idx, + chunk_idx * chunk_size + m_idx * block_M : chunk_idx * chunk_size + (m_idx + 1) * block_M, + bz, + n_idx * block_N : (n_idx + 1) * block_N, + ], + x_residual_shared, + ) T.copy(x_residual_shared, x_residual_local) for i, j in T.Parallel(block_M, block_N): acc_o[i, j] += x_residual_local[i, j] * D_local[0] @@ -200,27 +217,40 @@ def chunk_scan_fwd(batch, T.copy(acc_o, acc_o_shared) T.copy( acc_o_shared, - 
Output[batch_idx, chunk_idx * chunk_size + m_idx * block_M:chunk_idx * chunk_size + - (m_idx + 1) * block_M, bz, n_idx * block_N:(n_idx + 1) * block_N]) + Output[ + batch_idx, + chunk_idx * chunk_size + m_idx * block_M : chunk_idx * chunk_size + (m_idx + 1) * block_M, + bz, + n_idx * block_N : (n_idx + 1) * block_N, + ], + ) return main if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=8, help='batch size') - parser.add_argument('--heads', type=int, default=80, help='heads') - parser.add_argument('--groups', type=int, default=1, help='groups') - parser.add_argument('--seq_len', type=int, default=4096, help='sequence length') - parser.add_argument('--chunk_size', type=int, default=256, help='chunk size') - parser.add_argument('--dim', type=int, default=64, help='dim') - parser.add_argument('--dstate', type=int, default=128, help='dstate') - parser.add_argument('--tune', action='store_true', help='tune configs') + parser.add_argument("--batch", type=int, default=8, help="batch size") + parser.add_argument("--heads", type=int, default=80, help="heads") + parser.add_argument("--groups", type=int, default=1, help="groups") + parser.add_argument("--seq_len", type=int, default=4096, help="sequence length") + parser.add_argument("--chunk_size", type=int, default=256, help="chunk size") + parser.add_argument("--dim", type=int, default=64, help="dim") + parser.add_argument("--dstate", type=int, default=128, help="dstate") + parser.add_argument("--tune", action="store_true", help="tune configs") args = parser.parse_args() - batch, heads, groups, seq_len, chunk_size, dim, dstate = args.batch, args.heads, args.groups, args.seq_len, args.chunk_size, args.dim, args.dstate + batch, heads, groups, seq_len, chunk_size, dim, dstate = ( + args.batch, + args.heads, + args.groups, + args.seq_len, + args.chunk_size, + args.dim, + args.dstate, + ) total_flops = 2 * batch * seq_len * chunk_size * heads * dim * 0.5 + 2 * batch * seq_len * heads * dim * dstate - if (not args.tune): + if not args.tune: kernel = chunk_scan_fwd( batch, seq_len, @@ -234,7 +264,8 @@ if __name__ == "__main__": block_K=64, block_Dstate=128, num_stages=2, - threads=128) + threads=128, + ) profiler = kernel.get_profiler(tilelang.TensorSupplyType.Normal) profiler.assert_allclose(ref_program, rtol=0.01, atol=0.01) print("All checks pass.") diff --git a/examples/linear_attention/example_mamba_chunk_state.py b/examples/linear_attention/example_mamba_chunk_state.py index ad3df0df81643e113a4dd09f0bea274c7c0016da..fb766d5e9c9c7d4b7e4de1f1f15f801f84b9de03 100644 --- a/examples/linear_attention/example_mamba_chunk_state.py +++ b/examples/linear_attention/example_mamba_chunk_state.py @@ -10,6 +10,7 @@ import itertools def chunk_state_triton(B, x, dt, dA_cumsum): from mamba_ssm.ops.triton.ssd_chunk_state import _chunk_state_fwd + return _chunk_state_fwd(B, x, dt, dA_cumsum, states_in_fp32=False) @@ -41,46 +42,33 @@ def ref_program(B, x, dt, dA_cumsum): x = rearrange(x, "b (c l) h p -> b c l h p", l=chunk_size) B = rearrange(B, "b (c l) ... 
-> b c l ...", l=chunk_size) decay_states = torch.exp((dA_cumsum[:, :, :, -1:] - dA_cumsum)) - return torch.einsum("bclhn,bhcl,bhcl,bclhp->bchpn", B.to(x.dtype), decay_states.to(x.dtype), - dt.to(x.dtype), x) + return torch.einsum("bclhn,bhcl,bhcl,bclhp->bchpn", B.to(x.dtype), decay_states.to(x.dtype), dt.to(x.dtype), x) def get_configs(): - iter_params = dict( - block_M=[64, 128], block_N=[32, 64, 128], block_K=[32, 64], num_stages=[1, 2, 3, 4, 5]) + iter_params = dict(block_M=[64, 128], block_N=[32, 64, 128], block_K=[32, 64], num_stages=[1, 2, 3, 4, 5]) return [dict(zip(iter_params, values)) for values in itertools.product(*iter_params.values())] @autotune(configs=get_configs(), warmup=10, rep=10) @tilelang.jit(out_idx=[4]) -def chunk_state_fwd(batch, - seqlen, - chunk_size, - ngroups, - nheads, - headdim, - dstate, - block_M=64, - block_N=64, - block_K=64, - num_stages=2, - threads=128): - dtype = "float16" - accum_dtype = "float" +def chunk_state_fwd( + batch, seqlen, chunk_size, ngroups, nheads, headdim, dstate, block_M=64, block_N=64, block_K=64, num_stages=2, threads=128 +): + dtype = T.float16 + accum_dtype = T.float32 nchunks = T.ceildiv(seqlen, chunk_size) p = 1.44269504 @T.prim_func - def main(B: T.Tensor((batch, seqlen, ngroups, dstate), dtype), x: T.Tensor( - (batch, seqlen, nheads, headdim), dtype), dt: T.Tensor( - (batch, nheads, nchunks, chunk_size), dtype), dA_cumsum: T.Tensor( - (batch, nheads, nchunks, chunk_size), dtype), Output: T.Tensor( - (batch, nchunks, nheads, headdim, dstate), dtype)): - with T.Kernel( - nheads, - T.ceildiv(headdim, block_M) * T.ceildiv(dstate, block_N), - batch * nchunks, - threads=threads) as (bz, bx, by): + def main( + B: T.Tensor((batch, seqlen, ngroups, dstate), dtype), + x: T.Tensor((batch, seqlen, nheads, headdim), dtype), + dt: T.Tensor((batch, nheads, nchunks, chunk_size), dtype), + dA_cumsum: T.Tensor((batch, nheads, nchunks, chunk_size), dtype), + Output: T.Tensor((batch, nchunks, nheads, headdim, dstate), dtype), + ): + with T.Kernel(nheads, T.ceildiv(headdim, block_M) * T.ceildiv(dstate, block_N), batch * nchunks, threads=threads) as (bz, bx, by): x_shared = T.alloc_shared((block_K, block_M), dtype) x_local = T.alloc_fragment((block_K, block_M), dtype) xt_local = T.alloc_fragment((block_M, block_K), dtype) @@ -101,20 +89,24 @@ def chunk_state_fwd(batch, m_idx = bx // T.ceildiv(dstate, block_N) n_idx = bx % T.ceildiv(dstate, block_N) - T.annotate_layout({ - x_shared: tilelang.layout.make_swizzled_layout(x_shared), - acc_o_shared: tilelang.layout.make_swizzled_layout(acc_o_shared) - }) + T.annotate_layout( + {x_shared: tilelang.layout.make_swizzled_layout(x_shared), acc_o_shared: tilelang.layout.make_swizzled_layout(acc_o_shared)} + ) dA_cs_last[0] = dA_cumsum[batch_idx, bz, chunk_idx, chunk_size - 1] T.clear(acc_o) for k in T.Pipelined(loop_range, num_stages=num_stages): T.copy( - x[batch_idx, chunk_idx * chunk_size + k * block_K:chunk_idx * chunk_size + - (k + 1) * block_K, bz, m_idx * block_M:(m_idx + 1) * block_M], x_shared) - T.copy(dA_cumsum[batch_idx, bz, chunk_idx, k * block_K:(k + 1) * block_K], - dA_cumsum_shared) - T.copy(dt[batch_idx, bz, chunk_idx, k * block_K:(k + 1) * block_K], dt_shared) + x[ + batch_idx, + chunk_idx * chunk_size + k * block_K : chunk_idx * chunk_size + (k + 1) * block_K, + bz, + m_idx * block_M : (m_idx + 1) * block_M, + ], + x_shared, + ) + T.copy(dA_cumsum[batch_idx, bz, chunk_idx, k * block_K : (k + 1) * block_K], dA_cumsum_shared) + T.copy(dt[batch_idx, bz, chunk_idx, k * block_K : (k + 1) * 
block_K], dt_shared) T.copy(dA_cumsum_shared, dA_cumsum_local) T.copy(dt_shared, dt_local) for i in T.Parallel(block_K): @@ -123,47 +115,50 @@ def chunk_state_fwd(batch, for i, j in T.Parallel(block_M, block_K): xt_local[i, j] = x_local[j, i] * scale[j] T.copy( - B[batch_idx, chunk_idx * chunk_size + k * block_K:chunk_idx * chunk_size + - (k + 1) * block_K, bz // (nheads // ngroups), - n_idx * block_N:(n_idx + 1) * block_N], B_shared) + B[ + batch_idx, + chunk_idx * chunk_size + k * block_K : chunk_idx * chunk_size + (k + 1) * block_K, + bz // (nheads // ngroups), + n_idx * block_N : (n_idx + 1) * block_N, + ], + B_shared, + ) T.gemm(xt_local, B_shared, acc_o) T.copy(acc_o, acc_o_shared) T.copy( acc_o_shared, - Output[batch_idx, chunk_idx, bz, m_idx * block_M:(m_idx + 1) * block_M, - n_idx * block_N:(n_idx + 1) * block_N]) + Output[batch_idx, chunk_idx, bz, m_idx * block_M : (m_idx + 1) * block_M, n_idx * block_N : (n_idx + 1) * block_N], + ) return main if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=8, help='batch size') - parser.add_argument('--heads', type=int, default=80, help='heads') - parser.add_argument('--groups', type=int, default=1, help='groups') - parser.add_argument('--seq_len', type=int, default=4096, help='sequence length') - parser.add_argument('--chunk_size', type=int, default=256, help='chunk size') - parser.add_argument('--dim', type=int, default=64, help='dim') - parser.add_argument('--dstate', type=int, default=128, help='dstate') - parser.add_argument('--tune', action='store_true', help='tune configs') + parser.add_argument("--batch", type=int, default=8, help="batch size") + parser.add_argument("--heads", type=int, default=80, help="heads") + parser.add_argument("--groups", type=int, default=1, help="groups") + parser.add_argument("--seq_len", type=int, default=4096, help="sequence length") + parser.add_argument("--chunk_size", type=int, default=256, help="chunk size") + parser.add_argument("--dim", type=int, default=64, help="dim") + parser.add_argument("--dstate", type=int, default=128, help="dstate") + parser.add_argument("--tune", action="store_true", help="tune configs") args = parser.parse_args() - batch, heads, groups, seq_len, chunk_size, dim, dstate = args.batch, args.heads, args.groups, args.seq_len, args.chunk_size, args.dim, args.dstate + batch, heads, groups, seq_len, chunk_size, dim, dstate = ( + args.batch, + args.heads, + args.groups, + args.seq_len, + args.chunk_size, + args.dim, + args.dstate, + ) total_flops = 2 * batch * seq_len * heads * dim * dstate - if (not args.tune): + if not args.tune: kernel = chunk_state_fwd( - batch, - seq_len, - chunk_size, - groups, - heads, - dim, - dstate, - block_M=64, - block_N=128, - block_K=64, - num_stages=4, - threads=128) + batch, seq_len, chunk_size, groups, heads, dim, dstate, block_M=64, block_N=128, block_K=64, num_stages=4, threads=128 + ) profiler = kernel.get_profiler(tilelang.TensorSupplyType.Normal) profiler.assert_allclose(ref_program, rtol=0.01, atol=0.01) print("All checks pass.") diff --git a/examples/linear_attention/example_retention_fwd.py b/examples/linear_attention/example_retention_fwd.py index 59445419a11a99fbb1a00fb6b03b0a1c94d4e9fd..f45e383889bd7ef2d93e1a00539e72110465ea43 100644 --- a/examples/linear_attention/example_retention_fwd.py +++ b/examples/linear_attention/example_retention_fwd.py @@ -13,13 +13,12 @@ def chunk_retention_fwd_kernel( H, DK, DV, - dtype: str = 'float16', + dtype: T.dtype = T.float16, scale: float = 
None, ) -> torch.Tensor: - if scale is None: scale = DK**-0.5 - accum_dtype = 'float' + accum_dtype = T.float32 chunk_size = 64 BK = BV = 64 # Set to 128 can be faster, but has some numerical differences with FLA @@ -30,16 +29,16 @@ def chunk_retention_fwd_kernel( @T.prim_func def chunk_retention_fwd( - Q: T.Tensor([B, S, H, DK], dtype), # type: ignore - K: T.Tensor([B, S, H, DK], dtype), # type: ignore - V: T.Tensor([B, S, H, DV], dtype), # type: ignore - O: T.Tensor([NK, B, S, H, DV], dtype), # type: ignore + Q: T.Tensor([B, S, H, DK], dtype), # type: ignore + K: T.Tensor([B, S, H, DK], dtype), # type: ignore + V: T.Tensor([B, S, H, DV], dtype), # type: ignore + O: T.Tensor([NK, B, S, H, DV], dtype), # type: ignore ): with T.Kernel(NV, NK, B * H) as (i_v, i_k, i_bh): i_b = i_bh // H i_h = i_bh % H - log_decay = T.alloc_var('float32') - log_decay = T.log2(1 - T.exp2(-5. - 1. * i_h)) # Head-specific log decay + log_decay = T.alloc_var(T.float32) + log_decay = T.log2(1 - T.exp2(-5.0 - 1.0 * i_h)) # Head-specific log decay q = T.alloc_shared([chunk_size, BK], dtype) k = T.alloc_shared([chunk_size, BK], dtype) @@ -56,14 +55,12 @@ def chunk_retention_fwd_kernel( for i in T.Pipelined(0, NT): for row, col in T.Parallel(chunk_size, BK): q[row, col] = Q[i_b, i * chunk_size + row, i_h, i_k * BK + col] * scale - T.copy(K[i_b, i * chunk_size:(i + 1) * chunk_size, i_h, i_k * BK:(i_k + 1) * BK], k) - T.copy(V[i_b, i * chunk_size:(i + 1) * chunk_size, i_h, i_v * BV:(i_v + 1) * BV], v) + T.copy(K[i_b, i * chunk_size : (i + 1) * chunk_size, i_h, i_k * BK : (i_k + 1) * BK], k) + T.copy(V[i_b, i * chunk_size : (i + 1) * chunk_size, i_h, i_v * BV : (i_v + 1) * BV], v) T.gemm(q, k, s, clear_accum=True, transpose_B=True) for row, col in T.Parallel(chunk_size, chunk_size): - s_shared[row, - col] = T.if_then_else(row >= col, s[row, col] * T.exp2( - (row - col) * log_decay), 0) + s_shared[row, col] = T.if_then_else(row >= col, s[row, col] * T.exp2((row - col) * log_decay), 0) T.copy(h, h_shared) T.gemm(q, h_shared, o, clear_accum=True) @@ -75,9 +72,7 @@ def chunk_retention_fwd_kernel( v[row, col] = v[row, col] * T.exp2((chunk_size - row - 1) * log_decay) for row, col in T.Parallel(BK, BV): h[row, col] = T.exp2(chunk_size * log_decay) * h[row, col] - T.copy( - o, O[i_k, i_b, i * chunk_size:(i + 1) * chunk_size, i_h, - i_v * BV:(i_v + 1) * BV]) + T.copy(o, O[i_k, i_b, i * chunk_size : (i + 1) * chunk_size, i_h, i_v * BV : (i_v + 1) * BV]) T.gemm(k, v, h, transpose_A=True) return chunk_retention_fwd @@ -89,24 +84,24 @@ def postprocess(o): def main(): parser = argparse.ArgumentParser() - parser.add_argument('--B', type=int, default=8, help='Batch size') - parser.add_argument('--S', type=int, default=4096, help='Seq len') - parser.add_argument('--H', type=int, default=32, help='Num heads') - parser.add_argument('--D', type=int, default=128, help='Head dim') + parser.add_argument("--B", type=int, default=8, help="Batch size") + parser.add_argument("--S", type=int, default=4096, help="Seq len") + parser.add_argument("--H", type=int, default=32, help="Num heads") + parser.add_argument("--D", type=int, default=128, help="Head dim") args = parser.parse_args() B, S, H, D = args.B, args.S, args.H, args.D total_flops = 2.0 * B * S * S * H * D # causal - q = torch.randn((B, S, H, D), device='cuda', dtype=torch.float16) - k = torch.randn((B, S, H, D), device='cuda', dtype=torch.float16) - v = torch.randn((B, S, H, D), device='cuda', dtype=torch.float16) + q = torch.randn((B, S, H, D), device="cuda", dtype=torch.float16) + k = 
torch.randn((B, S, H, D), device="cuda", dtype=torch.float16) + v = torch.randn((B, S, H, D), device="cuda", dtype=torch.float16) kernel = chunk_retention_fwd_kernel(B, S, H, D, D) t = do_bench(lambda: postprocess(kernel(q, k, v)), warmup=25, rep=100) - print(f'Tilelang latency: {t:.3f} ms') - print(f'Tilelang TFLOPs: {total_flops/t * 1e-9}') + print(f"Tilelang latency: {t:.3f} ms") + print(f"Tilelang TFLOPs: {total_flops / t * 1e-9}") -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/examples/minference/example_vertical_slash_sparse_attn.py b/examples/minference/example_vertical_slash_sparse_attn.py index ebf8513a1b3a5ebb858adb720e47325737888b28..f96e73ae511d300e3fa3569ef6910805ea19bca6 100644 --- a/examples/minference/example_vertical_slash_sparse_attn.py +++ b/examples/minference/example_vertical_slash_sparse_attn.py @@ -15,12 +15,11 @@ from tilelang.profiler import do_bench @tilelang.jit(out_idx=[3]) def _tl_vs_sparse_flashattn(batch, heads, seq_len, dim, vertical_size, slash_size): - block_M = 64 block_N = 64 num_stages = 2 threads = 128 - scale = (1.0 / dim)**0.5 * 1.44269504 + scale = (1.0 / dim) ** 0.5 * 1.44269504 shape = [batch, heads, seq_len, dim] seq_blocks = (seq_len + block_M - 1) // block_M @@ -30,15 +29,13 @@ def _tl_vs_sparse_flashattn(batch, heads, seq_len, dim, vertical_size, slash_siz offset_shape = count_shape + [slash_size] index_shape = count_shape + [vertical_size] - vertical_size_round, slash_size_round = tilelang.next_power_of_2( - vertical_size), tilelang.next_power_of_2(slash_size) + vertical_size_round, slash_size_round = tilelang.next_power_of_2(vertical_size), tilelang.next_power_of_2(slash_size) - dtype = "float16" - accum_dtype = "float" - int_dtype = "int32" + dtype = T.float16 + accum_dtype = T.float32 + int_dtype = T.int32 def kernel_func(block_M, block_N, num_stages, threads): - @T.macro def Prefetch( K: T.Tensor(shape, dtype), @@ -53,32 +50,30 @@ def _tl_vs_sparse_flashattn(batch, heads, seq_len, dim, vertical_size, slash_siz ): with T.attr("default", "async_scope", 1): for i, j in T.Parallel(block_N, dim): - K_shared[i, j] = T.if_then_else(k + i < column_count, - K[bz, by, column_index[k + i], j], 0) + K_shared[i, j] = T.if_then_else(k + i < column_count, K[bz, by, column_index[k + i], j], 0) with T.attr("default", "async_scope", 1): for i, j in T.Parallel(block_N, dim): - V_shared[i, j] = T.if_then_else(k + i < column_count, - V[bz, by, column_index[k + i], j], 0) + V_shared[i, j] = T.if_then_else(k + i < column_count, V[bz, by, column_index[k + i], j], 0) T.ptx_commit_group() @T.macro def Compute( - acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype), - acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype), - acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), - scores_max: T.FragmentBuffer([block_M], accum_dtype), - scores_max_prev: T.FragmentBuffer([block_M], accum_dtype), - k: T.int32, - column_count: T.int32, - Q_shared: T.SharedBuffer([block_M, dim], dtype), - K_shared: T.SharedBuffer([block_N, dim], dtype), - V_shared: T.SharedBuffer([block_N, dim], dtype), - scores_scale: T.FragmentBuffer([block_M], accum_dtype), - scores_sum: T.FragmentBuffer([block_M], accum_dtype), - logsum: T.FragmentBuffer([block_M], accum_dtype), - count: T.int32, + acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype), + acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype), + acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), + scores_max: T.FragmentBuffer([block_M], accum_dtype), + scores_max_prev: 
T.FragmentBuffer([block_M], accum_dtype), + k: T.int32, + column_count: T.int32, + Q_shared: T.SharedBuffer([block_M, dim], dtype), + K_shared: T.SharedBuffer([block_N, dim], dtype), + V_shared: T.SharedBuffer([block_N, dim], dtype), + scores_scale: T.FragmentBuffer([block_M], accum_dtype), + scores_sum: T.FragmentBuffer([block_M], accum_dtype), + logsum: T.FragmentBuffer([block_M], accum_dtype), + count: T.int32, ): T.ptx_wait_group(count) for i, j in T.Parallel(block_M, block_N): @@ -87,6 +82,8 @@ def _tl_vs_sparse_flashattn(batch, heads, seq_len, dim, vertical_size, slash_siz T.copy(scores_max, scores_max_prev) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) for i in T.Parallel(block_M): scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) @@ -106,17 +103,16 @@ def _tl_vs_sparse_flashattn(batch, heads, seq_len, dim, vertical_size, slash_siz @T.prim_func def vs_sparse_flashattn_ws( - Q: T.Tensor(shape, dtype), - K: T.Tensor(shape, dtype), - V: T.Tensor(shape, dtype), - Output: T.Tensor(shape, dtype), - BlockCount: T.Tensor(count_shape, int_dtype), - BlockOffset: T.Tensor(offset_shape, int_dtype), - ColumnCount: T.Tensor(count_shape, int_dtype), - ColumnIndex: T.Tensor(index_shape, int_dtype), + Q: T.Tensor(shape, dtype), + K: T.Tensor(shape, dtype), + V: T.Tensor(shape, dtype), + Output: T.Tensor(shape, dtype), + BlockCount: T.Tensor(count_shape, int_dtype), + BlockOffset: T.Tensor(offset_shape, int_dtype), + ColumnCount: T.Tensor(count_shape, int_dtype), + ColumnIndex: T.Tensor(index_shape, int_dtype), ): with T.Kernel(T.ceildiv(seq_len, block_M), heads, batch, threads=256) as (bc, by, bz): - bx = T.ceildiv(seq_len, block_M) - 1 - bc Q_shared = T.alloc_shared([block_M, dim], dtype) K_shared = T.alloc_shared([2, block_N, dim], dtype) @@ -141,9 +137,11 @@ def _tl_vs_sparse_flashattn(batch, heads, seq_len, dim, vertical_size, slash_siz T.create_list_of_mbarrier([128] * 9) - T.annotate_layout({ - O_shared: tilelang.layout.make_swizzled_layout(O_shared), - }) + T.annotate_layout( + { + O_shared: tilelang.layout.make_swizzled_layout(O_shared), + } + ) block_count[0] = BlockCount[bz, by, bx] column_count[0] = ColumnCount[bz, by, bx] @@ -160,15 +158,15 @@ def _tl_vs_sparse_flashattn(batch, heads, seq_len, dim, vertical_size, slash_siz if tid >= 128: T.annotate_producer_reg_dealloc() - T.copy(Q[bz, by, bx * block_M:(bx + 1) * block_M, :], Q_shared) + T.copy(Q[bz, by, bx * block_M : (bx + 1) * block_M, :], Q_shared) T.mbarrier_arrive(mbarrier=8) for bi in T.serial(block_count[0]): k = block_offset[bi] T.mbarrier_wait_parity(mbarrier=bi % 2 + 4, parity=(((bi & 3) >> 1) ^ 1)) - T.copy(K[bz, by, k:k + block_N, :], K_shared[bi % 2, :, :]) + T.copy(K[bz, by, k : k + block_N, :], K_shared[bi % 2, :, :]) T.mbarrier_arrive(mbarrier=bi % 2) T.mbarrier_wait_parity(mbarrier=bi % 2 + 6, parity=(((bi & 3) >> 1) ^ 1)) - T.copy(V[bz, by, k:k + block_N, :], V_shared[bi % 2, :, :]) + T.copy(V[bz, by, k : k + block_N, :], V_shared[bi % 2, :, :]) T.mbarrier_arrive(mbarrier=bi % 2 + 2) else: T.annotate_consumer_reg_alloc() @@ -179,37 +177,28 @@ def _tl_vs_sparse_flashattn(batch, heads, seq_len, dim, vertical_size, slash_siz for bi in T.serial(block_count[0]): k = block_offset[bi] for i, j in T.Parallel(block_M, block_N): - acc_s[i, j] = T.if_then_else(bx * block_M + i >= k + j, 0, - -T.infinity(acc_s.dtype)) + acc_s[i, j] = T.if_then_else(bx * block_M + i >= k + j, 0, -T.infinity(acc_s.dtype)) 
T.mbarrier_wait_parity(mbarrier=bi % 2, parity=((bi & 3) >> 1)) - T.gemm( - Q_shared, - K_shared[bi % 2, :, :], - acc_s, - transpose_B=True, - policy=T.GemmWarpPolicy.FullRow) + T.gemm(Q_shared, K_shared[bi % 2, :, :], acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) T.mbarrier_arrive(mbarrier=bi % 2 + 4) T.copy(scores_max, scores_max_prev) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) for i in T.Parallel(block_M): - scores_scale[i] = T.exp2(scores_max_prev[i] * scale - - scores_max[i] * scale) + scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_M, block_N): acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_M, dim): acc_o[i, j] = acc_o[i, j] * scores_scale[i] T.copy(acc_s, acc_s_cast) - T.mbarrier_wait_parity(mbarrier=bi % 2 + 2, parity=(((bi & 3) >> 1))) - T.gemm( - acc_s_cast, - V_shared[bi % 2, :, :], - acc_o, - policy=T.GemmWarpPolicy.FullRow) + T.mbarrier_wait_parity(mbarrier=bi % 2 + 2, parity=((bi & 3) >> 1)) + T.gemm(acc_s_cast, V_shared[bi % 2, :, :], acc_o, policy=T.GemmWarpPolicy.FullRow) T.mbarrier_arrive(mbarrier=bi % 2 + 6) @@ -219,38 +208,85 @@ def _tl_vs_sparse_flashattn(batch, heads, seq_len, dim, vertical_size, slash_siz logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] if column_count[0] != 0: - Prefetch(K, V, K_shared_1, V_shared_1, column_index, column_count[0], 0, bz, - by) + Prefetch(K, V, K_shared_1, V_shared_1, column_index, column_count[0], 0, bz, by) for bi in T.serial(T.ceildiv(column_count[0], block_N) - 1): k = bi * block_N if bi % 2 == 0: - Prefetch(K, V, K_shared_2, V_shared_2, column_index, - column_count[0], k + block_N, bz, by) - - Compute(acc_s, acc_s_cast, acc_o, scores_max, scores_max_prev, k, - column_count[0], Q_shared, K_shared_1, V_shared_1, - scores_scale, scores_sum, logsum, 1) + Prefetch(K, V, K_shared_2, V_shared_2, column_index, column_count[0], k + block_N, bz, by) + + Compute( + acc_s, + acc_s_cast, + acc_o, + scores_max, + scores_max_prev, + k, + column_count[0], + Q_shared, + K_shared_1, + V_shared_1, + scores_scale, + scores_sum, + logsum, + 1, + ) else: - Prefetch(K, V, K_shared_1, V_shared_1, column_index, - column_count[0], k + block_N, bz, by) - - Compute(acc_s, acc_s_cast, acc_o, scores_max, scores_max_prev, k, - column_count[0], Q_shared, K_shared_2, V_shared_2, - scores_scale, scores_sum, logsum, 1) + Prefetch(K, V, K_shared_1, V_shared_1, column_index, column_count[0], k + block_N, bz, by) + + Compute( + acc_s, + acc_s_cast, + acc_o, + scores_max, + scores_max_prev, + k, + column_count[0], + Q_shared, + K_shared_2, + V_shared_2, + scores_scale, + scores_sum, + logsum, + 1, + ) if T.ceildiv(column_count[0], block_N) % 2 == 0: - Compute(acc_s, acc_s_cast, acc_o, scores_max, scores_max_prev, - T.ceildiv(column_count[0], block_N) * block_N - block_N, - column_count[0], Q_shared, K_shared_2, V_shared_2, scores_scale, - scores_sum, logsum, 0) + Compute( + acc_s, + acc_s_cast, + acc_o, + scores_max, + scores_max_prev, + T.ceildiv(column_count[0], block_N) * block_N - block_N, + column_count[0], + Q_shared, + K_shared_2, + V_shared_2, + scores_scale, + scores_sum, + logsum, + 0, + ) else: - Compute(acc_s, acc_s_cast, acc_o, scores_max, scores_max_prev, - T.ceildiv(column_count[0], block_N) * block_N - block_N, - column_count[0], Q_shared, K_shared_1, V_shared_1, scores_scale, - scores_sum, logsum, 0) + Compute( + acc_s, + 
acc_s_cast, + acc_o, + scores_max, + scores_max_prev, + T.ceildiv(column_count[0], block_N) * block_N - block_N, + column_count[0], + Q_shared, + K_shared_1, + V_shared_1, + scores_scale, + scores_sum, + logsum, + 0, + ) for i, j in T.Parallel(block_M, dim): acc_o[i, j] /= logsum[i] T.copy(acc_o, O_shared) - T.copy(O_shared, Output[bz, by, bx * block_M:(bx + 1) * block_M, :]) + T.copy(O_shared, Output[bz, by, bx * block_M : (bx + 1) * block_M, :]) return vs_sparse_flashattn_ws @@ -466,11 +502,8 @@ def vertical_slash_sparse_attention( import os current_dir = os.path.dirname(os.path.abspath(__file__)) - sources = [ - os.path.join(current_dir, 'ops', 'kernels.cpp'), - os.path.join(current_dir, 'ops', 'vertical_slash_index.cu') - ] - ops = load(name='convert', sources=sources, verbose=False) + sources = [os.path.join(current_dir, "ops", "kernels.cpp"), os.path.join(current_dir, "ops", "vertical_slash_index.cu")] + ops = load(name="convert", sources=sources, verbose=False) convert_vertical_slash_indexes = ops.convert_vertical_slash_indexes batch_size, num_heads, context_size, head_dim = query.shape pad = (block_size_M - context_size) & (block_size_M - 1) @@ -481,15 +514,13 @@ def vertical_slash_sparse_attention( value = torch.nn.functional.pad(value, [0, 0, 0, pad, 0, 0, 0, 0]) if head_dim not in [16, 32, 64, 128, 256, 512]: - target_dim = 2**math.ceil(math.log2(head_dim)) - head_dim + target_dim = 2 ** math.ceil(math.log2(head_dim)) - head_dim query = torch.nn.functional.pad(query, [0, target_dim, 0, 0, 0, 0, 0, 0]) key = torch.nn.functional.pad(key, [0, target_dim, 0, 0, 0, 0, 0, 0]) value = torch.nn.functional.pad(value, [0, target_dim, 0, 0, 0, 0, 0, 0]) - v_idx = v_idx.to(torch.int32).reshape((batch_size, num_heads, -1)).sort( - dim=-1, descending=False)[0] - s_idx = s_idx.to(torch.int32).reshape((batch_size, num_heads, -1)).sort( - dim=-1, descending=True)[0] + v_idx = v_idx.to(torch.int32).reshape((batch_size, num_heads, -1)).sort(dim=-1, descending=False)[0] + s_idx = s_idx.to(torch.int32).reshape((batch_size, num_heads, -1)).sort(dim=-1, descending=True)[0] seqlens = torch.tensor([context_size] * query.shape[0], dtype=torch.int32, device=query.device) sm_scale = head_dim**-0.5 @@ -502,8 +533,7 @@ def vertical_slash_sparse_attention( block_size_N, ) - tl_kernel = _tl_vs_sparse_flashattn(batch_size, num_heads, context_size, head_dim, - v_idx.shape[2], s_idx.shape[2]) + tl_kernel = _tl_vs_sparse_flashattn(batch_size, num_heads, context_size, head_dim, v_idx.shape[2], s_idx.shape[2]) def run(is_triton: bool = True): if is_triton: @@ -521,8 +551,7 @@ def vertical_slash_sparse_attention( block_size_N, ) else: - out = tl_kernel(query, key, value, block_count, block_offset, column_count, - column_index) + out = tl_kernel(query, key, value, block_count, block_offset, column_count, column_index) return out[..., :context_size, :head_dim] return run @@ -532,8 +561,7 @@ def sum_all_diagonal_matrix(mat: torch.tensor): b, h, n, m = mat.shape zero_mat = torch.zeros((b, h, n, n)).to(mat.device) # Zero matrix used for padding mat_padded = torch.cat((zero_mat, mat, zero_mat), -1) # pads the matrix on left and right - mat_strided = mat_padded.as_strided( - (1, 1, n, n + m), (1, n * (2 * n + m), 2 * n + m + 1, 1)) # Change the strides + mat_strided = mat_padded.as_strided((1, 1, n, n + m), (1, n * (2 * n + m), 2 * n + m + 1, 1)) # Change the strides sum_diags = torch.sum(mat_strided, 2) # Sums the resulting matrix's columns return sum_diags[:, :, 1:] @@ -555,24 +583,23 @@ def main(argv=None): 
vertical_size, slash_size = args.vertical_size, args.slash_size torch.manual_seed(0) - q = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.float16) - k = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.float16) - v = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.float16) + q = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device="cuda", dtype=torch.float16) + k = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device="cuda", dtype=torch.float16) + v = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device="cuda", dtype=torch.float16) q_len = SEQ_LEN vertical_size, slash_size = min(q_len, vertical_size), min(q_len, slash_size) last_q = 64 - qk = torch.einsum('bhmk, bhnk -> bhmn', q[:, :, -last_q:, :], k) + qk = torch.einsum("bhmk, bhnk -> bhmn", q[:, :, -last_q:, :], k) arange = torch.arange(last_q, device="cuda") - qk[:, :, :, -last_q:] = torch.where(arange[None, None, :, None] >= arange[None, None, None, :], - qk[:, :, :, -last_q:], -torch.inf) + qk[:, :, :, -last_q:] = torch.where(arange[None, None, :, None] >= arange[None, None, None, :], qk[:, :, :, -last_q:], -torch.inf) qk = torch.nn.functional.softmax(qk, dim=-1, dtype=torch.float32) vertical = qk.sum(-2, keepdim=True) vertical[..., :30] = torch.inf vertical_topk = torch.topk(vertical, vertical_size, -1).indices - slash = sum_all_diagonal_matrix(qk)[..., :-last_q + 1] + slash = sum_all_diagonal_matrix(qk)[..., : -last_q + 1] slash[..., -30:] = torch.inf slash = (q_len - 1) - torch.topk(slash, slash_size, -1).indices diff --git a/examples/norm/rms_norm.py b/examples/norm/rms_norm.py index 40d367c2d6708ce7e93d2f0db5f66fb2a38e7819..57bccc1a0f901ddb2ac84b0b0e2ac8f92c95480a 100644 --- a/examples/norm/rms_norm.py +++ b/examples/norm/rms_norm.py @@ -4,7 +4,7 @@ import tilelang.language as T def rms_norm_splitk(M, N, blk_m, blk_k): - dtype = "float" + dtype = T.float @T.prim_func def main(A: T.Tensor((M, N), dtype), B: T.Tensor((M, N), dtype)): @@ -35,7 +35,7 @@ def rms_norm_splitk(M, N, blk_m, blk_k): @tilelang.jit(out_idx=[-1], pass_configs={"tl.disable_tma_lower": True}) def rms_norm(M, N, blk_m): - dtype = "float" + dtype = T.float @T.prim_func def main(A: T.Tensor((M, N), dtype), B: T.Tensor((M, N), dtype)): @@ -45,7 +45,7 @@ def rms_norm(M, N, blk_m): A_local = T.alloc_fragment((blk_m, N), dtype) A_powsum = T.alloc_fragment((blk_m,), dtype) - T.copy(A[bx * blk_m:(bx + 1) * blk_m, :], A_shared) + T.copy(A[bx * blk_m : (bx + 1) * blk_m, :], A_shared) T.copy(A_shared, A_local) for i, j in T.Parallel(blk_m, N): A_pow_local[i, j] = A_local[i, j] * A_local[i, j] @@ -54,7 +54,7 @@ def rms_norm(M, N, blk_m): A_powsum[i] = T.rsqrt(A_powsum[i] / N + 1e-12) for i, j in T.Parallel(blk_m, N): A_local[i, j] *= A_powsum[i] - T.copy(A_local, B[bx * blk_m:(bx + 1) * blk_m, :]) + T.copy(A_local, B[bx * blk_m : (bx + 1) * blk_m, :]) return main diff --git a/examples/norm/test_rms_norm.py b/examples/norm/test_rms_norm.py index a05f9b082a8111d7b77b9351456d5b8760323178..53db03d98ce32ee034a1d36b9ee590c0992267ce 100644 --- a/examples/norm/test_rms_norm.py +++ b/examples/norm/test_rms_norm.py @@ -5,7 +5,7 @@ import tilelang.language as T def rms_norm_splitk(M, N, blk_m, blk_k): - dtype = "float" + dtype = T.float @T.prim_func def main(A: T.Tensor((M, N), dtype), B: T.Tensor((M, N), dtype)): @@ -35,7 +35,7 @@ def rms_norm_splitk(M, N, blk_m, blk_k): def rms_norm(M, N, blk_m): - dtype = "float" + dtype = T.float @T.prim_func def main(A: T.Tensor((M, N), dtype), B: T.Tensor((M, N), dtype)): @@ 
-45,7 +45,7 @@ def rms_norm(M, N, blk_m): A_local = T.alloc_fragment((blk_m, N), dtype) A_powsum = T.alloc_fragment((blk_m,), dtype) - T.copy(A[bx * blk_m:(bx + 1) * blk_m, :], A_shared) + T.copy(A[bx * blk_m : (bx + 1) * blk_m, :], A_shared) T.copy(A_shared, A_local) for i, j in T.Parallel(blk_m, N): A_pow_local[i, j] = A_local[i, j] * A_local[i, j] @@ -54,7 +54,7 @@ def rms_norm(M, N, blk_m): A_powsum[i] = T.rsqrt(A_powsum[i] / N + 1e-12) for i, j in T.Parallel(blk_m, N): A_local[i, j] *= A_powsum[i] - T.copy(A_local, B[bx * blk_m:(bx + 1) * blk_m, :]) + T.copy(A_local, B[bx * blk_m : (bx + 1) * blk_m, :]) return main diff --git a/examples/online_softmax/online_softmax.py b/examples/online_softmax/online_softmax.py index 432482d063f271fd70411ef7ab72452211598688..811870e441b3f297c711a04179bde5cce925f126 100644 --- a/examples/online_softmax/online_softmax.py +++ b/examples/online_softmax/online_softmax.py @@ -9,19 +9,19 @@ from typing import Callable def softmax_kernel( M, N, - dtype: str = "float16", + dtype: T.dtype = T.float16, ) -> "Callable": BN = min(tl.next_power_of_2(N), 8192) NN = tl.cdiv(N, BN) - accum_dtype = "float" + accum_dtype = T.float32 scale = 1.44269504 # log2(e) @T.prim_func def main( - X: T.Tensor([M, N], dtype), - Y: T.Tensor([M, N], dtype), + X: T.Tensor([M, N], dtype), + Y: T.Tensor([M, N], dtype), ): with T.Kernel(M, threads=128) as (i_m): x = T.alloc_fragment([BN], dtype) @@ -33,7 +33,7 @@ def softmax_kernel( T.fill(lse, -T.infinity(accum_dtype)) for i_n in T.Pipelined(0, NN): - T.copy(X[i_m, i_n * BN:(i_n + 1) * BN], x) + T.copy(X[i_m, i_n * BN : (i_n + 1) * BN], x) T.reduce_max(x, max_x, dim=0, clear=True) @@ -45,12 +45,12 @@ def softmax_kernel( lse[0] = max_x[0] * scale + T.log2(T.exp2(lse[0] - max_x[0] * scale) + sum_exp_x[0]) for i_n in T.Pipelined(0, NN): - T.copy(X[i_m, i_n * BN:(i_n + 1) * BN], x) + T.copy(X[i_m, i_n * BN : (i_n + 1) * BN], x) for j in T.Parallel(BN): y[j] = T.exp2(x[j] * scale - lse[0]) - T.copy(y, Y[i_m, i_n * BN:(i_n + 1) * BN]) + T.copy(y, Y[i_m, i_n * BN : (i_n + 1) * BN]) return main @@ -69,4 +69,4 @@ t1 = do_bench(lambda: X.softmax(dim=1), warmup=25, rep=100) t2 = do_bench(lambda: kernel(X), warmup=25, rep=100) print(f"torch latency: {t1:.3f} ms") print(f"TileLang latency: {t2:.3f} ms") -print(f"Speedup: {t1/t2:.3f}x") +print(f"Speedup: {t1 / t2:.3f}x") diff --git a/examples/plot_layout/README.md b/examples/plot_layout/README.md index a65d771c20a79c28f4e3329c2201843780cb5d9e..8204e93d804edde4a1d9bbb00366f7c7be39dae1 100644 --- a/examples/plot_layout/README.md +++ b/examples/plot_layout/README.md @@ -10,7 +10,7 @@ from typing import Literal, Callable from tilelang.intrinsics.utils import get_mma_micro_size from tilelang.tools import plot_layout -def make_mma_load_base_layout(dtype: str = "float16", +def make_mma_load_base_layout(dtype: str = T.float16, matrix: Literal["A", "B"] = "A", transposed: bool = False) -> T.Fragment: """ @@ -69,7 +69,7 @@ def make_mma_load_base_layout(dtype: str = "float16", micro_size_s, _, micro_size_r = get_mma_micro_size(dtype) transform_func = transform_func - inverse_mma_load_layout = IndexMap.from_func(transform_func, index_dtype="int32") + inverse_mma_load_layout = IndexMap.from_func(transform_func, index_dtype=T.int32) def forward_thread(i: int, j: int) -> int: """ @@ -94,7 +94,7 @@ def make_mma_load_base_layout(dtype: str = "float16", # Create a 16×16 matrix layout for ldmatrix operations -base_layout = make_mma_load_base_layout(dtype="float16", matrix="A", transposed=False) +base_layout = 
make_mma_load_base_layout(dtype=T.float16, matrix="A", transposed=False) # Print the layout structure (optional for debugging) print(base_layout) diff --git a/examples/plot_layout/fragment_mfma_load_a.py b/examples/plot_layout/fragment_mfma_load_a.py index 2c3b282a6ae8ac5b937d6bfeaefc0d3911e043db..d45cc227bc2d0fcef5f1d034c0ed51f62f4c571e 100644 --- a/examples/plot_layout/fragment_mfma_load_a.py +++ b/examples/plot_layout/fragment_mfma_load_a.py @@ -11,10 +11,9 @@ from tilelang.intrinsics.mfma_layout import ( ) -def make_mfma_load_base_layout(dtype: str = "float16", - matrix: Literal["A", "B"] = "A", - k_dim: int = 16, - transposed: bool = False) -> T.Fragment: +def make_mfma_load_base_layout( + dtype: T.dtype = T.float16, matrix: Literal["A", "B"] = "A", k_dim: int = 16, transposed: bool = False +) -> T.Fragment: """ Create a layout function for storing MFMA results into a fragment buffer. This layout is used in conjunction with `inverse_mfma_store_layout` to @@ -72,17 +71,15 @@ def make_mfma_load_base_layout(dtype: str = "float16", # so the b matrix expected a transposed basic layout transform_func: Callable = None if matrix == "A": - transform_func = transform_func_sr_a if is_sr_axis_order else lambda i, j: transform_func_sr_a( - j, i) + transform_func = transform_func_sr_a if is_sr_axis_order else lambda i, j: transform_func_sr_a(j, i) micro_size_s, micro_size_r = micro_size_x, micro_size_k elif matrix == "B": - transform_func = transform_func_sr_b if is_sr_axis_order else lambda i, j: transform_func_sr_b( - j, i) + transform_func = transform_func_sr_b if is_sr_axis_order else lambda i, j: transform_func_sr_b(j, i) micro_size_s, micro_size_r = micro_size_k, micro_size_y else: raise ValueError(f"Unsupported matrix {matrix}") - inverse_mma_load_layout = IndexMap.from_func(transform_func, index_dtype="int32") + inverse_mma_load_layout = IndexMap.from_func(transform_func, index_dtype=T.int32) def forward_thread(i: int, j: int) -> int: """ @@ -115,19 +112,16 @@ chunk = 2 from tilelang.tools import plot_layout # ldmatrix layout 16x16 -base_layout = make_mfma_load_base_layout(dtype="float16", matrix="A", transposed=False) +base_layout = make_mfma_load_base_layout(dtype=T.float16, matrix="A", transposed=False) print(base_layout) plot_layout(base_layout, name="base_layout") # warp layout 32x32 -warp_layout = base_layout.repeat([warp_rows, warp_cols], - repeat_on_thread=False, - lower_dim_first=False) +warp_layout = base_layout.repeat([warp_rows, warp_cols], repeat_on_thread=False, lower_dim_first=False) print(warp_layout) plot_layout(warp_layout, name="warp_layout") # block layout 64x32 -block_layout = warp_layout.repeat([block_rows, 1], repeat_on_thread=True, - lower_dim_first=True).replicate(block_cols) +block_layout = warp_layout.repeat([block_rows, 1], repeat_on_thread=True, lower_dim_first=True).replicate(block_cols) print(block_layout) plot_layout(block_layout, name="block_layout") diff --git a/examples/plot_layout/fragment_mma_load_a.py b/examples/plot_layout/fragment_mma_load_a.py index 988899448365892002536e950103fc1504d73cdc..df4a0b88701192c44e9743360ad7ead14d4f0dbd 100644 --- a/examples/plot_layout/fragment_mma_load_a.py +++ b/examples/plot_layout/fragment_mma_load_a.py @@ -5,9 +5,7 @@ from tvm.tir import IndexMap from tilelang.intrinsics.utils import get_mma_micro_size -def make_mma_load_base_layout(dtype: str = "float16", - matrix: Literal["A", "B"] = "A", - transposed: bool = False) -> T.Fragment: +def make_mma_load_base_layout(dtype: T.dtype = T.float16, matrix: Literal["A", "B"] 
= "A", transposed: bool = False) -> T.Fragment: """ Create a layout function for storing MMA results into a fragment buffer. This layout is used in conjunction with `inverse_mma_store_layout` to @@ -36,6 +34,7 @@ def make_mma_load_base_layout(dtype: str = "float16", shared_16x16_to_mma_32x8_layout_sr_b, shared_16x32_to_mma_32x16_layout_sr_b, ) + assert matrix in ["A", "B"], "matrix should be either A or B" dtype_bits = DataType(dtype).bits # s represents spatial axis @@ -67,17 +66,15 @@ def make_mma_load_base_layout(dtype: str = "float16", # so the b matrix expected a transposed basic layout transform_func: Callable = None if matrix == "A": - transform_func = transform_func_sr_a if is_sr_axis_order else lambda i, j: transform_func_sr_a( - j, i) + transform_func = transform_func_sr_a if is_sr_axis_order else lambda i, j: transform_func_sr_a(j, i) micro_size_s, micro_size_r = micro_size_x, micro_size_k elif matrix == "B": - transform_func = transform_func_sr_b if is_sr_axis_order else lambda i, j: transform_func_sr_b( - j, i) + transform_func = transform_func_sr_b if is_sr_axis_order else lambda i, j: transform_func_sr_b(j, i) micro_size_s, micro_size_r = micro_size_k, micro_size_y else: raise ValueError(f"Unsupported matrix {matrix}") - inverse_mma_load_layout = IndexMap.from_func(transform_func, index_dtype="int32") + inverse_mma_load_layout = IndexMap.from_func(transform_func, index_dtype=T.int32) def forward_thread(i: int, j: int) -> int: """ @@ -110,7 +107,7 @@ chunk = 2 from tilelang.tools import plot_layout # ldmatrix layout 16x16 -base_layout = make_mma_load_base_layout(dtype="float16", matrix="A", transposed=False) +base_layout = make_mma_load_base_layout(dtype=T.float16, matrix="A", transposed=False) print(base_layout) plot_layout(base_layout, name="base_layout") diff --git a/examples/quickstart.py b/examples/quickstart.py index 42514ee39ebbcf3fb7f7975c8c034fe027ed86ae..e99fc0dbceff115a0569495b563764170f05fa89 100644 --- a/examples/quickstart.py +++ b/examples/quickstart.py @@ -6,13 +6,12 @@ import tilelang.language as T # target currently can be "cuda" or "hip" or "cpu". # if not specified, it will be inferred from the input tensors during compile time @tilelang.jit -def matmul(M, N, K, block_M, block_N, block_K, dtype="float16", accum_dtype="float"): - +def matmul(M, N, K, block_M, block_N, block_K, dtype=T.float16, accum_dtype=T.float32): @T.prim_func def matmul_relu_kernel( - A: T.Tensor((M, K), dtype), - B: T.Tensor((K, N), dtype), - C: T.Tensor((M, N), dtype), + A: T.Tensor((M, K), dtype), + B: T.Tensor((K, N), dtype), + C: T.Tensor((M, N), dtype), ): # Initialize Kernel Context with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=128) as (bx, by): @@ -55,10 +54,9 @@ block_M = 128 block_N = 128 block_K = 32 -# 1. Define the kernel (matmul) and compile/lower it into an executable module +# Define the kernel (matmul) and compile/lower it into an executable module matmul_relu_kernel = matmul(M, N, K, block_M, block_N, block_K) - -# 3. Test the kernel in Python with PyTorch data +# Test the kernel in Python with PyTorch data import torch # Create random input tensors on the GPU @@ -78,7 +76,7 @@ torch.testing.assert_close(c, ref_c, rtol=1e-2, atol=1e-2) print("Kernel output matches PyTorch reference.") # 4. 
Retrieve and inspect the generated CUDA source (optional) -# cuda_source = jit_kernel.get_kernel_source() +# cuda_source = matmul_relu_kernel.get_kernel_source() # print("Generated CUDA kernel:\n", cuda_source) # 5.Profile latency with kernel diff --git a/examples/rand/rand_uint.py b/examples/rand/rand_uint.py new file mode 100644 index 0000000000000000000000000000000000000000..466a51b7a312643c66d02c7069db021f6f3cb036 --- /dev/null +++ b/examples/rand/rand_uint.py @@ -0,0 +1,57 @@ +import tilelang +import tilelang.language as T +import torch +import triton +import triton.language as tl + + +@tilelang.jit +def tilelang_rand_1d(M=1024, seed=42): + num_per_thread = 128 + threads = 1 + blk_M = num_per_thread * threads + + @T.prim_func + def rand_kernel(A: T.Tensor((M,), "uint32")): + with T.Kernel(T.ceildiv(M, threads * num_per_thread), threads=threads) as bx: + tx = T.get_thread_binding() + T.rng_init(seed, 0, bx * blk_M + tx * num_per_thread) + for i, j in T.Parallel(threads, num_per_thread): + offsets = (bx * threads + i) * num_per_thread + idx = offsets + j + if idx < M: + A[idx] = T.rng_rand() + + return rand_kernel + + +@triton.jit +def triton_rand_1d(X, M, elements_per_thread, seed): + pid = tl.program_id(0) + offset = pid * elements_per_thread + tl.arange(0, elements_per_thread) + + r0, r1, r2, r3 = tl.randint4x(seed, offset) + + base_idx = offset * 4 + tl.store(X + base_idx, r0, mask=base_idx < M) + tl.store(X + base_idx + 1, r1, mask=(base_idx + 1) < M) + tl.store(X + base_idx + 2, r2, mask=(base_idx + 2) < M) + tl.store(X + base_idx + 3, r3, mask=(base_idx + 3) < M) + + +def test_rand_1d(M, seed): + kernel = tilelang_rand_1d(M, seed) + tilelang_result = torch.empty(M, dtype=torch.uint32, device="cuda") + kernel(tilelang_result) + + triton_result = torch.empty(M, dtype=torch.uint32, device="cuda") + grid = (triton.cdiv(M, 128),) + triton_rand_1d[grid](triton_result, tl.constexpr(M), tl.constexpr(128 // 4), seed) + + torch.testing.assert_close(tilelang_result, triton_result) + + +if __name__ == "__main__": + test_rand_1d(1024, 42) + test_rand_1d(512, 123) + test_rand_1d(128, 0) diff --git a/examples/seer_attention/block_sparse_attn_tilelang.py b/examples/seer_attention/block_sparse_attn_tilelang.py index dcd581c6b9f098a2e42e4046209c37c610bd7992..25741f97cce73d6e8d9c06c9928590db5a53d68c 100644 --- a/examples/seer_attention/block_sparse_attn_tilelang.py +++ b/examples/seer_attention/block_sparse_attn_tilelang.py @@ -10,10 +10,7 @@ def get_sparse_attn_mask_from_topk(x, topk, use_dense_for_last_block=False): bsz, num_head, downsample_len, _ = x.shape # N_CTX = downsample_len * BLOCK sparse_index = torch.topk(x, topk, dim=-1).indices - dense_mask = torch.full([bsz, num_head, downsample_len, downsample_len], - False, - dtype=torch.bool, - device=x.device) + dense_mask = torch.full([bsz, num_head, downsample_len, downsample_len], False, dtype=torch.bool, device=x.device) dense_mask.scatter_(-1, sparse_index, True) if use_dense_for_last_block: dense_mask[:, :, -2:, :] = True @@ -30,38 +27,41 @@ def get_sparse_attn_mask_from_threshold(x, threshold, use_dense_for_last_block=F @tilelang.jit( - out_idx=[4], pass_configs={ + out_idx=[4], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def blocksparse_flashattn(batch, heads, seq_q, seq_kv, dim, downsample_len, is_causal): block_M = 64 block_N = 64 num_stages = 0 threads = 128 - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) q_shape = [batch, heads, seq_q, dim] 
kv_shape = [batch, heads, seq_kv, dim] block_mask_shape = [batch, heads, downsample_len, downsample_len] - dtype = "float16" - accum_dtype = "float" - block_mask_dtype = "int8" + dtype = T.float16 + accum_dtype = T.float32 + block_mask_dtype = T.int8 def kernel_func(block_M, block_N, num_stages, threads): - @T.macro def Softmax( - acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype), - acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype), - scores_max: T.FragmentBuffer([block_M], accum_dtype), - scores_max_prev: T.FragmentBuffer([block_M], accum_dtype), - scores_scale: T.FragmentBuffer([block_M], accum_dtype), - scores_sum: T.FragmentBuffer([block_M], accum_dtype), - logsum: T.FragmentBuffer([block_M], accum_dtype), + acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype), + acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype), + scores_max: T.FragmentBuffer([block_M], accum_dtype), + scores_max_prev: T.FragmentBuffer([block_M], accum_dtype), + scores_scale: T.FragmentBuffer([block_M], accum_dtype), + scores_sum: T.FragmentBuffer([block_M], accum_dtype), + logsum: T.FragmentBuffer([block_M], accum_dtype), ): T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) # To do causal softmax, we need to set the scores_max to 0 if it is -inf # This process is called Check_inf in FlashAttention3 code, and it only need to be done # in the first ceil_div(kBlockM, kBlockN) steps. @@ -81,19 +81,19 @@ def blocksparse_flashattn(batch, heads, seq_q, seq_kv, dim, downsample_len, is_c @T.macro def Rescale( - acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), - scores_scale: T.FragmentBuffer([block_M], accum_dtype), + acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), + scores_scale: T.FragmentBuffer([block_M], accum_dtype), ): for i, j in T.Parallel(block_M, dim): acc_o[i, j] *= scores_scale[i] @T.prim_func def main( - Q: T.Tensor(q_shape, dtype), - K: T.Tensor(kv_shape, dtype), - V: T.Tensor(kv_shape, dtype), - BlockSparseMask: T.Tensor(block_mask_shape, block_mask_dtype), - Output: T.Tensor(q_shape, dtype), + Q: T.Tensor(q_shape, dtype), + K: T.Tensor(kv_shape, dtype), + V: T.Tensor(kv_shape, dtype), + BlockSparseMask: T.Tensor(block_mask_shape, block_mask_dtype), + Output: T.Tensor(q_shape, dtype), ): with T.Kernel(T.ceildiv(seq_q, block_M), heads, batch, threads=threads) as (bx, by, bz): Q_shared = T.alloc_shared([block_M, dim], dtype) @@ -110,7 +110,7 @@ def blocksparse_flashattn(batch, heads, seq_q, seq_kv, dim, downsample_len, is_c logsum = T.alloc_fragment([block_M], accum_dtype) block_mask = T.alloc_local([downsample_len], block_mask_dtype) - T.copy(Q[bz, by, bx * block_M:(bx + 1) * block_M, :], Q_shared) + T.copy(Q[bz, by, bx * block_M : (bx + 1) * block_M, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) @@ -122,33 +122,25 @@ def blocksparse_flashattn(batch, heads, seq_q, seq_kv, dim, downsample_len, is_c for k in T.Pipelined(loop_range, num_stages=num_stages): if block_mask[k] != 0: - T.copy(K[bz, by, k * block_N:(k + 1) * block_N, :], K_shared) + T.copy(K[bz, by, k * block_N : (k + 1) * block_N, :], K_shared) if is_causal: past_len = seq_kv - seq_q for i, j in T.Parallel(block_M, block_N): - acc_s[i, j] = T.if_then_else( - bx * block_M + i + past_len >= k * block_N + j, 0, - -T.infinity(acc_s.dtype)) + acc_s[i, j] = T.if_then_else(bx * block_M + i + past_len >= k * 
block_N + j, 0, -T.infinity(acc_s.dtype)) else: T.clear(acc_s) - T.gemm( - Q_shared, - K_shared, - acc_s, - transpose_B=True, - policy=T.GemmWarpPolicy.FullRow) - - Softmax(acc_s, acc_s_cast, scores_max, scores_max_prev, scores_scale, - scores_sum, logsum) + T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) + + Softmax(acc_s, acc_s_cast, scores_max, scores_max_prev, scores_scale, scores_sum, logsum) Rescale(acc_o, scores_scale) - T.copy(V[bz, by, k * block_N:(k + 1) * block_N, :], V_shared) + T.copy(V[bz, by, k * block_N : (k + 1) * block_N, :], V_shared) T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) for i, j in T.Parallel(block_M, dim): acc_o[i, j] /= logsum[i] T.copy(acc_o, O_shared) - T.copy(O_shared, Output[bz, by, bx * block_M:(bx + 1) * block_M, :]) + T.copy(O_shared, Output[bz, by, bx * block_M : (bx + 1) * block_M, :]) return main @@ -163,44 +155,40 @@ def test_topk_sparse_attention(): torch.manual_seed(0) # Create inputs - q = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.float16) - k = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.float16) - v = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.float16) + q = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device="cuda", dtype=torch.float16) + k = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device="cuda", dtype=torch.float16) + v = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device="cuda", dtype=torch.float16) sm_scale = 1.0 / (D_HEAD**0.5) # Create sparse mask (downsampled to block level) downsample_factor = BLOCK downsample_len = math.ceil(SEQ_LEN / downsample_factor) - x_ds = torch.randn([BATCH, N_HEADS, downsample_len, downsample_len], - device='cuda', - dtype=torch.float16) + x_ds = torch.randn([BATCH, N_HEADS, downsample_len, downsample_len], device="cuda", dtype=torch.float16) x_ds[:, :, :, 0] = 100 block_mask = get_sparse_attn_mask_from_topk(x_ds, topk=TOPK) # Run tilelang kernel - kernel = blocksparse_flashattn( - BATCH, N_HEADS, SEQ_LEN, SEQ_LEN, D_HEAD, downsample_len, is_causal=True) + kernel = blocksparse_flashattn(BATCH, N_HEADS, SEQ_LEN, SEQ_LEN, D_HEAD, downsample_len, is_causal=True) tilelang_output = kernel(q, k, v, block_mask.to(torch.int8)) # Compute reference # Expand block mask to full attention matrix - full_mask = torch.kron(block_mask.float(), torch.ones(BLOCK, BLOCK, device='cuda')) + full_mask = torch.kron(block_mask.float(), torch.ones(BLOCK, BLOCK, device="cuda")) full_mask = full_mask[..., :SEQ_LEN, :SEQ_LEN].bool() full_mask = full_mask & torch.tril(torch.ones_like(full_mask)) # Apply causal # PyTorch reference implementation - attn = torch.einsum('bhsd,bhtd->bhst', q, k) * sm_scale - attn = attn.masked_fill(~full_mask, float('-inf')) + attn = torch.einsum("bhsd,bhtd->bhst", q, k) * sm_scale + attn = attn.masked_fill(~full_mask, float("-inf")) attn = F.softmax(attn, dim=-1) - ref_output = torch.einsum('bhst,bhtd->bhsd', attn, v) + ref_output = torch.einsum("bhst,bhtd->bhsd", attn, v) print("ref_output", ref_output) print("tilelang_output", tilelang_output) # Verify accuracy - assert torch.allclose(tilelang_output, ref_output, atol=1e-2, rtol=1e-2), \ - "TileLang output doesn't match reference" + assert torch.allclose(tilelang_output, ref_output, atol=1e-2, rtol=1e-2), "TileLang output doesn't match reference" print("Pass topk sparse attention test with qlen == klen") @@ -213,42 +201,40 @@ def test_topk_sparse_attention_qlen_lt_klen(): torch.manual_seed(0) # Create 
inputs. - q = torch.randn(BATCH, N_HEADS, Q_LEN, D_HEAD, device='cuda', dtype=torch.float16) - k = torch.randn(BATCH, N_HEADS, K_LEN, D_HEAD, device='cuda', dtype=torch.float16) - v = torch.randn(BATCH, N_HEADS, K_LEN, D_HEAD, device='cuda', dtype=torch.float16) + q = torch.randn(BATCH, N_HEADS, Q_LEN, D_HEAD, device="cuda", dtype=torch.float16) + k = torch.randn(BATCH, N_HEADS, K_LEN, D_HEAD, device="cuda", dtype=torch.float16) + v = torch.randn(BATCH, N_HEADS, K_LEN, D_HEAD, device="cuda", dtype=torch.float16) sm_scale = 1.0 / (D_HEAD**0.5) downsample_factor = BLOCK downsample_len = math.ceil(K_LEN / downsample_factor) # number of blocks along one dimension - x_ds = torch.randn( - BATCH, N_HEADS, downsample_len, downsample_len, device='cuda', dtype=torch.float16) + x_ds = torch.randn(BATCH, N_HEADS, downsample_len, downsample_len, device="cuda", dtype=torch.float16) # Force the first column to be high so that the first block is always selected. x_ds[:, :, :, 0] = 100 block_mask = get_sparse_attn_mask_from_topk(x_ds, topk=TOPK) - kernel = blocksparse_flashattn( - BATCH, N_HEADS, Q_LEN, K_LEN, D_HEAD, downsample_len, is_causal=True) + kernel = blocksparse_flashattn(BATCH, N_HEADS, Q_LEN, K_LEN, D_HEAD, downsample_len, is_causal=True) print(kernel.get_kernel_source()) tilelang_output = kernel(q, k, v, block_mask.to(torch.int8)) past_len = K_LEN - Q_LEN - attn = torch.einsum('bhsd,bhtd->bhst', q, k) * sm_scale + attn = torch.einsum("bhsd,bhtd->bhst", q, k) * sm_scale - full_mask_full = torch.kron(block_mask.float(), torch.ones(BLOCK, BLOCK, device='cuda')).bool() + full_mask_full = torch.kron(block_mask.float(), torch.ones(BLOCK, BLOCK, device="cuda")).bool() full_mask_full = full_mask_full[..., :K_LEN, :K_LEN] effective_mask = full_mask_full[..., past_len:K_LEN, :] # shape: (B, H, Q_LEN, K_LEN) i_global = torch.arange(past_len, K_LEN, device=k.device).unsqueeze(1) # shape: (Q_LEN, 1) j_global = torch.arange(K_LEN, device=k.device).unsqueeze(0) # shape: (1, K_LEN) - causal_mask = (j_global <= i_global) # shape: (Q_LEN, K_LEN) + causal_mask = j_global <= i_global # shape: (Q_LEN, K_LEN) final_mask = effective_mask & causal_mask # shape: (B, H, Q_LEN, K_LEN) - attn = attn.masked_fill(~final_mask, float('-inf')) + attn = attn.masked_fill(~final_mask, float("-inf")) attn = F.softmax(attn, dim=-1) - ref_output = torch.einsum('bhst,bhtd->bhsd', attn, v) + ref_output = torch.einsum("bhst,bhtd->bhsd", attn, v) print("ref_output", ref_output) print("tilelang_output", tilelang_output) diff --git a/examples/seer_attention/block_sparse_attn_triton.py b/examples/seer_attention/block_sparse_attn_triton.py index ed33cc1e2a4055bad1954318c51a163b3609e921..b4cc3cd00c854cdde0757af38dcf6302976cd49f 100644 --- a/examples/seer_attention/block_sparse_attn_triton.py +++ b/examples/seer_attention/block_sparse_attn_triton.py @@ -15,10 +15,7 @@ def get_sparse_attn_mask_from_topk(x, topk, use_dense_for_last_block=False): bsz, num_head, downsample_len, _ = x.shape # N_CTX = downsample_len * BLOCK sparse_index = torch.topk(x, topk, dim=-1).indices - dense_mask = torch.full([bsz, num_head, downsample_len, downsample_len], - False, - dtype=torch.bool, - device=x.device) + dense_mask = torch.full([bsz, num_head, downsample_len, downsample_len], False, dtype=torch.bool, device=x.device) dense_mask.scatter_(-1, sparse_index, True) if use_dense_for_last_block: dense_mask[:, :, -2:, :] = True @@ -54,7 +51,6 @@ def _fwd_kernel_inner( BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, ): - mask_val = tl.load(block_mask_ptr + 
k_block_col_idx * stride_bmask_n) if mask_val == True: @@ -69,7 +65,7 @@ def _fwd_kernel_inner( qk *= sm_scale # the following is needed only when LAST_K_BLOCK or BLOCK_M < BLOCK_N - qk += tl.where(offs_m[:, None] + past_len >= (start_n + offs_n[None, :]), 0, float('-inf')) + qk += tl.where(offs_m[:, None] + past_len >= (start_n + offs_n[None, :]), 0, float("-inf")) m_ij = tl.maximum(m_i, tl.max(qk, 1)) qk -= m_ij[:, None] @@ -149,7 +145,7 @@ def _fwd_kernel( v_ptrs = V + off_v mask_ptrs = block_mask_ptr + start_m * stride_bmm - m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float('inf') + m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf") l_i = tl.zeros([BLOCK_M], dtype=tl.float32) acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32) @@ -185,24 +181,12 @@ def _fwd_kernel( acc = acc * l_recip acc = acc.to(Out.dtype.element_ty) - off_o = off_z * stride_oz + off_h * stride_oh + offs_m[:, None] * stride_om + offs_d[ - None, :] * stride_od + off_o = off_z * stride_oz + off_h * stride_oh + offs_m[:, None] * stride_om + offs_d[None, :] * stride_od out_ptrs = Out + off_o tl.store(out_ptrs, acc, mask=offs_m[:, None] < N_CTX) -def _forward(ctx, - q, - k, - v, - block_sparse_mask, - sm_scale, - BLOCK_M=64, - BLOCK_N=64, - num_warps=None, - num_stages=1, - out=None): - +def _forward(ctx, q, k, v, block_sparse_mask, sm_scale, BLOCK_M=64, BLOCK_N=64, num_warps=None, num_stages=1, out=None): assert q.shape[-1] == k.shape[-1] == v.shape[-1] assert k.shape[2] == v.shape[2] o = out if out is not None else torch.empty_like(q).contiguous() @@ -247,7 +231,6 @@ def _forward(ctx, class _sparse_attention(torch.autograd.Function): - @staticmethod def forward(ctx, q, k, v, block_sparse_dense, sm_scale): # shape constraints @@ -271,9 +254,9 @@ def test_topk_sparse_attention(): torch.manual_seed(0) # Create inputs - q = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.bfloat16) - k = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.bfloat16) - v = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.bfloat16) + q = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device="cuda", dtype=torch.bfloat16) + k = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device="cuda", dtype=torch.bfloat16) + v = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device="cuda", dtype=torch.bfloat16) sm_scale = 1.0 / (D_HEAD**0.5) # Create sparse mask (downsampled to block level) @@ -281,9 +264,7 @@ def test_topk_sparse_attention(): downsample_len = math.ceil(SEQ_LEN / downsample_factor) print("downsample_len", downsample_len) - x_ds = torch.randn([BATCH, N_HEADS, downsample_len, downsample_len], - device='cuda', - dtype=torch.bfloat16) + x_ds = torch.randn([BATCH, N_HEADS, downsample_len, downsample_len], device="cuda", dtype=torch.bfloat16) x_ds[:, :, :, 0] = 100 print("x_ds.shape", x_ds.shape) block_mask = get_sparse_attn_mask_from_topk(x_ds, topk=TOPK) @@ -295,22 +276,21 @@ def test_topk_sparse_attention(): # Compute reference # Expand block mask to full attention matrix - full_mask = torch.kron(block_mask.float(), torch.ones(BLOCK, BLOCK, device='cuda')) + full_mask = torch.kron(block_mask.float(), torch.ones(BLOCK, BLOCK, device="cuda")) full_mask = full_mask[..., :SEQ_LEN, :SEQ_LEN].bool() full_mask = full_mask & torch.tril(torch.ones_like(full_mask)) # Apply causal # PyTorch reference implementation - attn = torch.einsum('bhsd,bhtd->bhst', q, k) * sm_scale - attn = attn.masked_fill(~full_mask, float('-inf')) + attn = torch.einsum("bhsd,bhtd->bhst", q, k) * sm_scale 
+ attn = attn.masked_fill(~full_mask, float("-inf")) attn = F.softmax(attn, dim=-1) - ref_output = torch.einsum('bhst,bhtd->bhsd', attn, v) + ref_output = torch.einsum("bhst,bhtd->bhsd", attn, v) # print("ref_output", ref_output) # print("triton_output", triton_output) # Verify accuracy - assert torch.allclose(triton_output, ref_output, atol=1e-2, rtol=1e-2), \ - "Triton output doesn't match reference" + assert torch.allclose(triton_output, ref_output, atol=1e-2, rtol=1e-2), "Triton output doesn't match reference" print("Pass topk sparse attention test with qlen == klen") @@ -322,16 +302,15 @@ def test_topk_sparse_attention_qlt_kl(): torch.manual_seed(0) # Create inputs. - q = torch.randn(BATCH, N_HEADS, Q_LEN, D_HEAD, device='cuda', dtype=torch.bfloat16) - k = torch.randn(BATCH, N_HEADS, K_LEN, D_HEAD, device='cuda', dtype=torch.bfloat16) - v = torch.randn(BATCH, N_HEADS, K_LEN, D_HEAD, device='cuda', dtype=torch.bfloat16) + q = torch.randn(BATCH, N_HEADS, Q_LEN, D_HEAD, device="cuda", dtype=torch.bfloat16) + k = torch.randn(BATCH, N_HEADS, K_LEN, D_HEAD, device="cuda", dtype=torch.bfloat16) + v = torch.randn(BATCH, N_HEADS, K_LEN, D_HEAD, device="cuda", dtype=torch.bfloat16) # softmax scale sm_scale = 1.0 / (D_HEAD**0.5) downsample_factor = BLOCK downsample_len = math.ceil(K_LEN / downsample_factor) # number of blocks along one dimension - x_ds = torch.randn( - BATCH, N_HEADS, downsample_len, downsample_len, device='cuda', dtype=torch.bfloat16) + x_ds = torch.randn(BATCH, N_HEADS, downsample_len, downsample_len, device="cuda", dtype=torch.bfloat16) # Force the first column to be high so that the first block is always selected. x_ds[:, :, :, 0] = 100 block_mask = get_sparse_attn_mask_from_topk(x_ds, topk=TOPK) @@ -340,26 +319,25 @@ def test_topk_sparse_attention_qlt_kl(): past_len = K_LEN - Q_LEN - attn = torch.einsum('bhsd,bhtd->bhst', q, k) * sm_scale + attn = torch.einsum("bhsd,bhtd->bhst", q, k) * sm_scale - full_mask_full = torch.kron(block_mask.float(), torch.ones(BLOCK, BLOCK, device='cuda')).bool() + full_mask_full = torch.kron(block_mask.float(), torch.ones(BLOCK, BLOCK, device="cuda")).bool() full_mask_full = full_mask_full[..., :K_LEN, :K_LEN] effective_mask = full_mask_full[..., past_len:K_LEN, :] # shape: (B, H, Q_LEN, K_LEN) i_global = torch.arange(past_len, K_LEN, device=k.device).unsqueeze(1) # shape: (Q_LEN, 1) j_global = torch.arange(K_LEN, device=k.device).unsqueeze(0) # shape: (1, K_LEN) - causal_mask = (j_global <= i_global) # shape: (Q_LEN, K_LEN) + causal_mask = j_global <= i_global # shape: (Q_LEN, K_LEN) final_mask = effective_mask & causal_mask # shape: (B, H, Q_LEN, K_LEN) - attn = attn.masked_fill(~final_mask, float('-inf')) + attn = attn.masked_fill(~final_mask, float("-inf")) attn = F.softmax(attn, dim=-1) - ref_output = torch.einsum('bhst,bhtd->bhsd', attn, v) + ref_output = torch.einsum("bhst,bhtd->bhsd", attn, v) # Verify accuracy. 
- assert torch.allclose(triton_output, ref_output, atol=1e-2, rtol=1e-2), \ - "Triton output doesn't match reference when qlen < klen" + assert torch.allclose(triton_output, ref_output, atol=1e-2, rtol=1e-2), "Triton output doesn't match reference when qlen < klen" print("Pass topk sparse attention test with qlen < klen") diff --git a/examples/sparse_tensorcore/tilelang_example_sparse_tensorcore.py b/examples/sparse_tensorcore/tilelang_example_sparse_tensorcore.py index 59c79c283b957ab344cc725182f87cfee27af8ca..14339ff02932819d4273acc818e8da0256354dcc 100644 --- a/examples/sparse_tensorcore/tilelang_example_sparse_tensorcore.py +++ b/examples/sparse_tensorcore/tilelang_example_sparse_tensorcore.py @@ -1,7 +1,8 @@ import torch import tilelang from tilelang.utils.sparse import compress_sm90 -from tilelang.layout import make_metadata_layout +from tilelang.layout import make_cutlass_metadata_layout +from tilelang import language as T import tilelang.testing @@ -24,32 +25,24 @@ def matmul_sp( A_shared_shape = (block_M, block_K // 2) B_shared_shape = (block_K, block_N) - import tilelang.language as T - @T.prim_func def main( - A_sparse: T.Tensor(A_sparse_shape, in_dtype), - E: T.Tensor((M, K // 8), 'uint8'), - B: T.Tensor(B_shape, in_dtype), - C: T.Tensor((M, N), out_dtype), + A_sparse: T.Tensor(A_sparse_shape, in_dtype), + E: T.Tensor((M, K // 8), "uint8"), + B: T.Tensor(B_shape, in_dtype), + C: T.Tensor((M, N), out_dtype), ): with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): A_shared = T.alloc_shared(A_shared_shape, in_dtype) B_shared = T.alloc_shared(B_shared_shape, in_dtype) - E_shared = T.alloc_shared((block_M, block_K // 8), 'uint8') + E_shared = T.alloc_shared((block_M, block_K // 8), "uint8") C_local = T.alloc_fragment((block_M, block_N), accum_dtype) - T.annotate_layout({ - E: - make_metadata_layout( - E, mma_dtype="float16", arch="9.0", backend="cutlass", block_k=block_K), - E_shared: - make_metadata_layout( - E_shared, - mma_dtype="float16", - arch="9.0", - backend="cutlass", - block_k=block_K), - }) + T.annotate_layout( + { + E: make_cutlass_metadata_layout(E, mma_dtype=T.float16, arch="9.0", block_k=block_K), + E_shared: make_cutlass_metadata_layout(E_shared, mma_dtype=T.float16, arch="9.0", block_k=block_K), + } + ) T.clear(C_local) for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages): T.copy(E[by * block_M, k * block_K // 8], E_shared) @@ -61,7 +54,7 @@ def matmul_sp( return main -def generate_2_to_4_sparse_tensor(shape, dtype=torch.float32, device='cpu'): +def generate_2_to_4_sparse_tensor(shape, dtype=torch.float32, device="cpu"): if shape[-1] % 4 != 0: raise ValueError("Last dimension must be divisible by 4 for 2:4 sparsity.") @@ -106,9 +99,9 @@ def run_gemm_sp( num_threads, ) - A = generate_2_to_4_sparse_tensor((M, K), dtype=torch.float16, device='cuda') + A = generate_2_to_4_sparse_tensor((M, K), dtype=torch.float16, device="cuda") A_sparse, E = compress_sm90(A, block_k=block_K, transposed=False) - B = torch.randn((K, N), device='cuda', dtype=torch.float16) + B = torch.randn((K, N), device="cuda", dtype=torch.float16) C_sp = kernel(A_sparse, E, B).half() C = torch.matmul(A, B) @@ -117,7 +110,7 @@ def run_gemm_sp( def main(): - run_gemm_sp(512, 1024, 768, "float16", "float16", "float32", 128, 128, 128, 2, 128) + run_gemm_sp(512, 1024, 768, T.float16, T.float16, T.float32, 128, 128, 128, 2, 128) if __name__ == "__main__": diff --git a/examples/topk/example_topk.py b/examples/topk/example_topk.py index 
0ca19fb18d7b14fc69eaed7cc6721f8c3925645e..d4f0c8bfb28a116ca418633db3e0450d75bbf55e 100644 --- a/examples/topk/example_topk.py +++ b/examples/topk/example_topk.py @@ -22,19 +22,19 @@ def tl_topk( blk_m, threads=128, ): - dtype = "float32" + dtype = T.float32 @T.prim_func def topk_kernel( - logits: T.Tensor([M, N], dtype), - topk_gates: T.Tensor([M, topk], dtype), - topk_indices: T.Tensor([M, topk], "int32"), + logits: T.Tensor([M, N], dtype), + topk_gates: T.Tensor([M, topk], dtype), + topk_indices: T.Tensor([M, topk], T.int32), ): with T.Kernel(T.ceildiv(M, blk_m), threads=threads) as bx: logits_frag = T.alloc_fragment([blk_m, N], dtype=dtype) max_val = T.alloc_fragment([blk_m], dtype=dtype) - expand_max_idx = T.alloc_fragment([blk_m, N], "int32") - max_idx = T.alloc_fragment([blk_m], "int32") + expand_max_idx = T.alloc_fragment([blk_m, N], T.int32) + max_idx = T.alloc_fragment([blk_m], T.int32) T.copy(logits[bx * blk_m, 0], logits_frag) @@ -43,15 +43,12 @@ def tl_topk( T.reduce_max(logits_frag, max_val, dim=1, clear=True) for i, j in T.Parallel(blk_m, N): - expand_max_idx[i, j] = T.if_then_else(max_val[i] == logits_frag[i, j], j, - expand_max_idx[i, j]) + expand_max_idx[i, j] = T.if_then_else(max_val[i] == logits_frag[i, j], j, expand_max_idx[i, j]) T.reduce_max(expand_max_idx, max_idx, dim=1, clear=True) for i, j in T.Parallel(blk_m, N): - - logits_frag[i, j] = T.if_then_else(max_val[i] == logits_frag[i, j], -10000.0, - logits_frag[i, j]) + logits_frag[i, j] = T.if_then_else(max_val[i] == logits_frag[i, j], -10000.0, logits_frag[i, j]) for i in T.Parallel(blk_m): topk_gates[bx * blk_m + i, k] = max_val[i] @@ -61,7 +58,6 @@ def tl_topk( def ref_program(logits, top_k): - top_k_gates, top_k_indices = logits.topk(top_k, dim=1) return top_k_gates, top_k_indices.to(torch.int32) diff --git a/examples/visual_layout_inference/visual_layout_inference.py b/examples/visual_layout_inference/visual_layout_inference.py new file mode 100644 index 0000000000000000000000000000000000000000..8fa1eaf854ec283042b2f3a1c7d2c9d1ae1dd457 --- /dev/null +++ b/examples/visual_layout_inference/visual_layout_inference.py @@ -0,0 +1,61 @@ +import tilelang +import tilelang.language as T + + +# use pass_configs to enable layout visualization +@tilelang.jit( + out_idx=[-1], + pass_configs={ + tilelang.PassConfigKey.TL_LAYOUT_VISUALIZATION_ENABLE: True, + tilelang.PassConfigKey.TL_LAYOUT_VISUALIZATION_FORMATS: "svg", + }, +) +def matmul(M, N, K, block_M, block_N, block_K, dtype=T.float16, accum_dtype=T.float32): + @T.prim_func + def gemm( + A: T.Tensor((M, K), dtype), + B: T.Tensor((K, N), dtype), + C: T.Tensor((M, N), dtype), + ): + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=128) as (bx, by): + A_shared = T.alloc_shared((block_M, block_K), dtype) + B_shared = T.alloc_shared((block_K, block_N), dtype) + C_local = T.alloc_fragment((block_M, block_N), accum_dtype) + + T.clear(C_local) + for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=3): + T.copy(A[by * block_M, k * block_K], A_shared) + T.copy(B[k * block_K, bx * block_N], B_shared) + T.gemm(A_shared, B_shared, C_local) + + T.copy(C_local, C[by * block_M, bx * block_N]) + + return gemm + + +def main(): + kernel = matmul(128, 128, 128, 32, 32, 32) + + import torch + + a = torch.randn(128, 128).cuda().half() + b = torch.randn(128, 128).cuda().half() + + c = kernel(a, b) + + ref_c = a @ b + + torch.testing.assert_close(c, ref_c, rtol=1e-2, atol=1e-2) + print("All check passed.") + + # print the layout visualization result and save figures to 
./tmp. + """ + C_local inferenced layout: + Shape: [32, 32] -> [8] + Thread: _j // 16 * 64 + _i // 16 * 32 + _i % 8 * 4 + _j % 8 // 2 + Index: [_j % 16 // 8 * 4 + _i % 16 // 8 * 2 + _j % 2] + """ + + +if __name__ == "__main__": + main() diff --git a/examples/warp_specialize/example_warp_specialize_flashmla.py b/examples/warp_specialize/example_warp_specialize_flashmla.py index 4a8f41ee4fbf846f5e655529729fc01b3e1951f9..6dcd51aa7c9b885895f19aebe5bbc50f4687f14d 100644 --- a/examples/warp_specialize/example_warp_specialize_flashmla.py +++ b/examples/warp_specialize/example_warp_specialize_flashmla.py @@ -9,9 +9,9 @@ import argparse @tilelang.jit(out_idx=[6]) def flashattn(batch, heads, kv_head_num, seqlen_kv, dim, pe_dim, block_N, block_H, num_split): - scale = (1.0 / (dim + pe_dim))**0.5 * 1.44269504 # log2(e) - dtype = "float16" - accum_dtype = "float" + scale = (1.0 / (dim + pe_dim)) ** 0.5 * 1.44269504 # log2(e) + dtype = T.float16 + accum_dtype = T.float32 kv_group_num = heads // kv_head_num VALID_BLOCK_H = min(block_H, kv_group_num) assert kv_head_num == 1, "kv_head_num must be 1" @@ -19,11 +19,11 @@ def flashattn(batch, heads, kv_head_num, seqlen_kv, dim, pe_dim, block_N, block_ @T.macro def flash_attn( - Q: T.Tensor([batch, heads, dim], dtype), - Q_pe: T.Tensor([batch, heads, pe_dim], dtype), - KV: T.Tensor([batch, seqlen_kv, kv_head_num, dim], dtype), - K_pe: T.Tensor([batch, seqlen_kv, kv_head_num, pe_dim], dtype), - Output: T.Tensor([batch, heads, dim], dtype), + Q: T.Tensor([batch, heads, dim], dtype), + Q_pe: T.Tensor([batch, heads, pe_dim], dtype), + KV: T.Tensor([batch, seqlen_kv, kv_head_num, dim], dtype), + K_pe: T.Tensor([batch, seqlen_kv, kv_head_num, pe_dim], dtype), + Output: T.Tensor([batch, heads, dim], dtype), ): with T.Kernel(heads // min(block_H, kv_group_num), batch, threads=256) as (hid, bid): # smem_sQ @@ -81,10 +81,12 @@ def flashattn(batch, heads, kv_head_num, seqlen_kv, dim, pe_dim, block_N, block_ cur_kv_head = hid // (kv_group_num // block_H) - T.annotate_layout({ - O_shared_l: tilelang.layout.make_swizzled_layout(O_shared_l), - O_shared_r: tilelang.layout.make_swizzled_layout(O_shared_r), - }) + T.annotate_layout( + { + O_shared_l: tilelang.layout.make_swizzled_layout(O_shared_l), + O_shared_r: tilelang.layout.make_swizzled_layout(O_shared_r), + } + ) # barriers_Q q_shared_ready_barrier = T.alloc_barrier(arrive_count=256) @@ -108,9 +110,9 @@ def flashattn(batch, heads, kv_head_num, seqlen_kv, dim, pe_dim, block_N, block_ tx = T.get_thread_binding() - T.copy(Q[bid, hid * VALID_BLOCK_H:(hid + 1) * VALID_BLOCK_H, :h_dim], Q_shared_l) - T.copy(Q[bid, hid * VALID_BLOCK_H:(hid + 1) * VALID_BLOCK_H, h_dim:], Q_shared_r) - T.copy(Q_pe[bid, hid * VALID_BLOCK_H:(hid + 1) * VALID_BLOCK_H, :], Q_pe_shared) + T.copy(Q[bid, hid * VALID_BLOCK_H : (hid + 1) * VALID_BLOCK_H, :h_dim], Q_shared_l) + T.copy(Q[bid, hid * VALID_BLOCK_H : (hid + 1) * VALID_BLOCK_H, h_dim:], Q_shared_r) + T.copy(Q_pe[bid, hid * VALID_BLOCK_H : (hid + 1) * VALID_BLOCK_H, :], Q_pe_shared) T.barrier_arrive(q_shared_ready_barrier) T.barrier_wait(q_shared_ready_barrier, 0) @@ -123,25 +125,18 @@ def flashattn(batch, heads, kv_head_num, seqlen_kv, dim, pe_dim, block_N, block_ T.fill(acc_o_l, 0) T.fill(logsum_0, 0) - T.copy(KV[bid, block_N:2 * block_N, cur_kv_head, :h_dim], KV_shared_1_l) + T.copy(KV[bid, block_N : 2 * block_N, cur_kv_head, :h_dim], KV_shared_1_l) T.barrier_arrive(kv_shared_1_l_is_ready) - T.copy(KV[bid, block_N:2 * block_N, cur_kv_head, h_dim:], KV_shared_1_r) + T.copy(KV[bid, block_N : 2 
* block_N, cur_kv_head, h_dim:], KV_shared_1_r) T.barrier_arrive(kv_shared_1_r_is_ready) - T.copy(K_pe[bid, block_N:2 * block_N, cur_kv_head, :], K_pe_shared_1) + T.copy(K_pe[bid, block_N : 2 * block_N, cur_kv_head, :], K_pe_shared_1) T.barrier_arrive(kv_shared_1_pe_is_ready) for k in T.serial(loop_range): - T.barrier_wait(kv_shared_0_l_is_ready, k % 2) - T.gemm( - Q_shared_l, - KV_shared_0_l, - acc_s_0, - transpose_B=True, - clear_accum=True, - wg_wait=-1) + T.gemm(Q_shared_l, KV_shared_0_l, acc_s_0, transpose_B=True, clear_accum=True, wg_wait=-1) T.barrier_wait(kv_shared_0_r_is_ready, k % 2) T.gemm(Q_shared_r, KV_shared_0_r, acc_s_0, transpose_B=True, wg_wait=-1) @@ -161,8 +156,7 @@ def flashattn(batch, heads, kv_head_num, seqlen_kv, dim, pe_dim, block_N, block_ for i, j in T.Parallel(block_H, block_N): acc_s_0[i, j] = T.exp2(acc_s_0[i, j] * scale - scores_max[i] * scale) for i in T.Parallel(block_H): - scores_scale_0[i] = T.exp2(scores_max_prev_0[i] * scale - - scores_max[i] * scale) + scores_scale_0[i] = T.exp2(scores_max_prev_0[i] * scale - scores_max[i] * scale) T.reduce_sum(acc_s_0, scores_sum_0, dim=1) @@ -182,9 +176,7 @@ def flashattn(batch, heads, kv_head_num, seqlen_kv, dim, pe_dim, block_N, block_ T.barrier_wait(scale_1_ready_barrier, k % 2) if k < loop_range - 1: - T.copy( - KV[bid, (2 * k + 2) * block_N:(2 * k + 3) * block_N, - cur_kv_head, :h_dim], KV_shared_0_l) + T.copy(KV[bid, (2 * k + 2) * block_N : (2 * k + 3) * block_N, cur_kv_head, :h_dim], KV_shared_0_l) T.barrier_arrive(kv_shared_0_l_is_ready) # Step 11. @@ -204,15 +196,10 @@ def flashattn(batch, heads, kv_head_num, seqlen_kv, dim, pe_dim, block_N, block_ T.gemm(SP1_shared, KV_shared_1_l, acc_o_l) if k < loop_range - 1: - - T.copy( - KV[bid, (2 * k + 3) * block_N:(2 * k + 4) * block_N, - cur_kv_head, :h_dim], KV_shared_1_l) + T.copy(KV[bid, (2 * k + 3) * block_N : (2 * k + 4) * block_N, cur_kv_head, :h_dim], KV_shared_1_l) T.barrier_arrive(kv_shared_1_l_is_ready) - T.copy( - K_pe[bid, (2 * k + 3) * block_N:(2 * k + 4) * block_N, cur_kv_head, :], - K_pe_shared_1) + T.copy(K_pe[bid, (2 * k + 3) * block_N : (2 * k + 4) * block_N, cur_kv_head, :], K_pe_shared_1) T.barrier_arrive(kv_shared_1_pe_is_ready) T.copy(logsum_0, logsum) @@ -221,8 +208,7 @@ def flashattn(batch, heads, kv_head_num, seqlen_kv, dim, pe_dim, block_N, block_ for i, j in T.Parallel(block_H, h_dim): acc_o_l[i, j] /= logsum[i] T.copy(acc_o_l, O_shared_l) - T.copy(O_shared_l, Output[bid, - hid * VALID_BLOCK_H:(hid + 1) * VALID_BLOCK_H, :h_dim]) + T.copy(O_shared_l, Output[bid, hid * VALID_BLOCK_H : (hid + 1) * VALID_BLOCK_H, :h_dim]) else: T.copy(Q_pe_shared, Q_pe_local_1) @@ -237,16 +223,9 @@ def flashattn(batch, heads, kv_head_num, seqlen_kv, dim, pe_dim, block_N, block_ T.barrier_arrive(kv_shared_0_pe_is_ready) for k in T.serial(loop_range): - # Step 2. T.barrier_wait(kv_shared_1_l_is_ready, k % 2) - T.gemm( - Q_shared_l, - KV_shared_1_l, - acc_s_1, - transpose_B=True, - clear_accum=True, - wg_wait=-1) + T.gemm(Q_shared_l, KV_shared_1_l, acc_s_1, transpose_B=True, clear_accum=True, wg_wait=-1) T.barrier_wait(kv_shared_1_r_is_ready, k % 2) T.gemm(Q_shared_r, KV_shared_1_r, acc_s_1, transpose_B=True, wg_wait=-1) @@ -265,8 +244,7 @@ def flashattn(batch, heads, kv_head_num, seqlen_kv, dim, pe_dim, block_N, block_ T.copy(scores_max_1, scores_max) for i in T.Parallel(block_H): - scores_scale_1[i] = T.exp2(scores_max_prev_1[i] * scale - - scores_max[i] * scale) + scores_scale_1[i] = T.exp2(scores_max_prev_1[i] * scale - scores_max[i] * scale) # Step 8. 
for i, j in T.Parallel(block_H, block_N): @@ -279,8 +257,7 @@ def flashattn(batch, heads, kv_head_num, seqlen_kv, dim, pe_dim, block_N, block_ acc_o_r[i, j] = acc_o_r[i, j] * (scores_scale_0[i] * scores_scale_1[i]) for i in T.Parallel(block_H): - logsum_1[i] = logsum_1[i] * scores_scale_1[i] * scores_scale_0[ - i] + scores_sum_1[i] + logsum_1[i] = logsum_1[i] * scores_scale_1[i] * scores_scale_0[i] + scores_sum_1[i] T.barrier_arrive(scale_1_ready_barrier) @@ -291,9 +268,7 @@ def flashattn(batch, heads, kv_head_num, seqlen_kv, dim, pe_dim, block_N, block_ T.barrier_arrive(s_shared_ready_barrier) if k < loop_range - 1: - T.copy( - KV[bid, (2 * k + 3) * block_N:(2 * k + 4) * block_N, cur_kv_head, - h_dim:], KV_shared_1_r) + T.copy(KV[bid, (2 * k + 3) * block_N : (2 * k + 4) * block_N, cur_kv_head, h_dim:], KV_shared_1_r) T.barrier_arrive(kv_shared_1_r_is_ready) T.barrier_wait(p0_1_1_ready_barrier, k % 2) @@ -301,15 +276,10 @@ def flashattn(batch, heads, kv_head_num, seqlen_kv, dim, pe_dim, block_N, block_ T.gemm(SP0_shared, KV_shared_0_r, acc_o_r) if k < loop_range - 1: - - T.copy( - KV[bid, (2 * k + 2) * block_N:(2 * k + 3) * block_N, cur_kv_head, - h_dim:], KV_shared_0_r) + T.copy(KV[bid, (2 * k + 2) * block_N : (2 * k + 3) * block_N, cur_kv_head, h_dim:], KV_shared_0_r) T.barrier_arrive(kv_shared_0_r_is_ready) - T.copy( - K_pe[bid, (2 * k + 2) * block_N:(2 * k + 3) * block_N, cur_kv_head, :], - K_pe_shared_0) + T.copy(K_pe[bid, (2 * k + 2) * block_N : (2 * k + 3) * block_N, cur_kv_head, :], K_pe_shared_0) T.barrier_arrive(kv_shared_0_pe_is_ready) T.barrier_wait(lse_0_ready_barrier, 0) @@ -319,18 +289,17 @@ def flashattn(batch, heads, kv_head_num, seqlen_kv, dim, pe_dim, block_N, block_ for i, j in T.Parallel(block_H, h_dim): acc_o_r[i, j] /= logsum[i] T.copy(acc_o_r, O_shared_r) - T.copy(O_shared_r, Output[bid, hid * VALID_BLOCK_H:(hid + 1) * VALID_BLOCK_H, - h_dim:]) + T.copy(O_shared_r, Output[bid, hid * VALID_BLOCK_H : (hid + 1) * VALID_BLOCK_H, h_dim:]) @T.prim_func def main_no_split( - Q: T.Tensor([batch, heads, dim], dtype), - Q_pe: T.Tensor([batch, heads, pe_dim], dtype), - KV: T.Tensor([batch, seqlen_kv, kv_head_num, dim], dtype), - K_pe: T.Tensor([batch, seqlen_kv, kv_head_num, pe_dim], dtype), - glse: T.Tensor([batch, heads, num_split], dtype), - Output_partial: T.Tensor([batch, heads, num_split, dim], dtype), - Output: T.Tensor([batch, heads, dim], dtype), + Q: T.Tensor([batch, heads, dim], dtype), + Q_pe: T.Tensor([batch, heads, pe_dim], dtype), + KV: T.Tensor([batch, seqlen_kv, kv_head_num, dim], dtype), + K_pe: T.Tensor([batch, seqlen_kv, kv_head_num, pe_dim], dtype), + glse: T.Tensor([batch, heads, num_split], dtype), + Output_partial: T.Tensor([batch, heads, num_split, dim], dtype), + Output: T.Tensor([batch, heads, dim], dtype), ): flash_attn(Q, Q_pe, KV, K_pe, Output) @@ -352,31 +321,24 @@ def ref_program(q, q_pe, kv, k_pe, glse, Output_partial): dim = q.shape[-1] pe_dim = q_pe.shape[-1] num_head_groups = q.shape[1] // kv.shape[2] - scale = (dim + pe_dim)**0.5 - q = rearrange( - q, 'b (h g) d -> b g h d', g=num_head_groups) # [batch_size, num_head_groups, groups, dim] + scale = (dim + pe_dim) ** 0.5 + q = rearrange(q, "b (h g) d -> b g h d", g=num_head_groups) # [batch_size, num_head_groups, groups, dim] - q_pe = rearrange( - q_pe, 'b (h g) d -> b g h d', - g=num_head_groups) # [batch_size, num_head_groups, groups, pe_dim] + q_pe = rearrange(q_pe, "b (h g) d -> b g h d", g=num_head_groups) # [batch_size, num_head_groups, groups, pe_dim] - kv = rearrange(kv, 'b n h d -> b 
h n d') # [batch_size, groups, seqlen_kv, dim] + kv = rearrange(kv, "b n h d -> b h n d") # [batch_size, groups, seqlen_kv, dim] - k_pe = rearrange(k_pe, 'b n h d -> b h n d') # [batch_size, num_head_groups, groups, pe_dim] + k_pe = rearrange(k_pe, "b n h d -> b h n d") # [batch_size, num_head_groups, groups, pe_dim] query = torch.concat([q, q_pe], dim=-1) key = torch.concat([kv, k_pe], dim=-1) - scores = einsum( - query, key, - 'b g h d, b h s d -> b g h s') # [batch_size, num_head_groups, groups, seqlen_kv] + scores = einsum(query, key, "b g h d, b h s d -> b g h s") # [batch_size, num_head_groups, groups, seqlen_kv] - attention = F.softmax( - scores / scale, dim=-1) # [batch_size, num_head_groups, groups, seqlen_kv] + attention = F.softmax(scores / scale, dim=-1) # [batch_size, num_head_groups, groups, seqlen_kv] - out = einsum(attention, kv, - 'b g h s, b h s d -> b g h d') # [batch_size, num_head_groups, groups, dim] - out = rearrange(out, 'b g h d -> b (h g) d') # [batch_size, heads, dim] + out = einsum(attention, kv, "b g h s, b h s d -> b g h d") # [batch_size, num_head_groups, groups, dim] + out = rearrange(out, "b g h d -> b (h g) d") # [batch_size, heads, dim] return out @@ -399,12 +361,12 @@ def main(batch=1, heads=64, kv_heads=1, kv_ctx=1024, dim=512, pe_dim=64): if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=1, help='batch size') - parser.add_argument('--heads', type=int, default=128, help='q heads number') - parser.add_argument('--kv_heads', type=int, default=1, help='kv heads number') - parser.add_argument('--kv_ctx', type=int, default=8192, help='kv context length') - parser.add_argument('--dim', type=int, default=512, help='head dim') - parser.add_argument('--pe_dim', type=int, default=64, help='pe head dim') + parser.add_argument("--batch", type=int, default=1, help="batch size") + parser.add_argument("--heads", type=int, default=128, help="q heads number") + parser.add_argument("--kv_heads", type=int, default=1, help="kv heads number") + parser.add_argument("--kv_ctx", type=int, default=8192, help="kv context length") + parser.add_argument("--dim", type=int, default=512, help="head dim") + parser.add_argument("--pe_dim", type=int, default=64, help="pe head dim") args = parser.parse_args() batch, heads, kv_heads, kv_ctx, dim, pe_dim = args.batch, args.heads, args.kv_heads, args.kv_ctx, args.dim, args.pe_dim main(batch, heads, kv_heads, kv_ctx, dim, pe_dim) diff --git a/examples/warp_specialize/example_warp_specialize_gemm_barrierpipe_stage2.py b/examples/warp_specialize/example_warp_specialize_gemm_barrierpipe_stage2.py index b738a4b9c6fb814f7f2a2a3b058f93b5f83eb93f..4a2aa00d929d0d91895cc98ea3668955a8883e8f 100644 --- a/examples/warp_specialize/example_warp_specialize_gemm_barrierpipe_stage2.py +++ b/examples/warp_specialize/example_warp_specialize_gemm_barrierpipe_stage2.py @@ -7,8 +7,7 @@ tilelang.disable_cache() # add decorator @tilelang.jit if you want to return a torch function # @tilelang.jit @tilelang.jit(out_idx=[2]) -def matmul(M, N, K, block_M, block_N, block_K, dtype="float16", accum_dtype="float"): - +def matmul(M, N, K, block_M, block_N, block_K, dtype=T.float16, accum_dtype=T.float32): num_stages = 2 mbarrier_list = [128, 128] * num_stages @@ -32,19 +31,13 @@ def matmul(M, N, K, block_M, block_N, block_K, dtype="float16", accum_dtype="flo for ko in range(T.ceildiv(K, block_K)): with T.ws(1): - T.mbarrier_wait_parity( - mbarrier=ko % num_stages + num_stages, - parity=((ko // num_stages) % 
num_stages) ^ 1) - T.copy(A[by * block_M:(by + 1) * block_M, ko * block_K:(ko + 1) * block_K], - A_shared[ko % num_stages, :, :]) - T.copy(B[ko * block_K:(ko + 1) * block_K, bx * block_N:(bx + 1) * block_N], - B_shared[ko % num_stages, :, :]) + T.mbarrier_wait_parity(mbarrier=ko % num_stages + num_stages, parity=((ko // num_stages) % num_stages) ^ 1) + T.copy(A[by * block_M : (by + 1) * block_M, ko * block_K : (ko + 1) * block_K], A_shared[ko % num_stages, :, :]) + T.copy(B[ko * block_K : (ko + 1) * block_K, bx * block_N : (bx + 1) * block_N], B_shared[ko % num_stages, :, :]) T.mbarrier_arrive(mbarrier=ko % num_stages) with T.ws(0): - T.mbarrier_wait_parity( - mbarrier=ko % num_stages, parity=(ko // num_stages) % num_stages) - T.gemm(A_shared[ko % num_stages, :, :], B_shared[ko % num_stages, :, :], - C_local) + T.mbarrier_wait_parity(mbarrier=ko % num_stages, parity=(ko // num_stages) % num_stages) + T.gemm(A_shared[ko % num_stages, :, :], B_shared[ko % num_stages, :, :], C_local) T.mbarrier_arrive(mbarrier=ko % num_stages + num_stages) with T.ws(0): diff --git a/examples/warp_specialize/example_warp_specialize_gemm_copy_0_gemm_1.py b/examples/warp_specialize/example_warp_specialize_gemm_copy_0_gemm_1.py index 9ba9f6816048cbbafeb275842cb197828732d24f..7b22784323ba327d0054a0f46e53bd7e6eb6acdc 100644 --- a/examples/warp_specialize/example_warp_specialize_gemm_copy_0_gemm_1.py +++ b/examples/warp_specialize/example_warp_specialize_gemm_copy_0_gemm_1.py @@ -5,20 +5,12 @@ import tilelang.language as T # add decorator @tilelang.jit if you want to return a torch function # @tilelang.jit @tilelang.jit(out_idx=[2]) -def matmul_warp_specialize_copy_0_gemm_1(M, - N, - K, - block_M, - block_N, - block_K, - dtype="float16", - accum_dtype="float"): - +def matmul_warp_specialize_copy_0_gemm_1(M, N, K, block_M, block_N, block_K, dtype=T.float16, accum_dtype=T.float32): @T.prim_func def main( - A: T.Tensor((M, K), dtype), - B: T.Tensor((K, N), dtype), - C: T.Tensor((M, N), dtype), + A: T.Tensor((M, K), dtype), + B: T.Tensor((K, N), dtype), + C: T.Tensor((M, N), dtype), ): # Initialize Kernel Context with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=256) as (bx, by): diff --git a/examples/warp_specialize/example_warp_specialize_gemm_copy_1_gemm_0.py b/examples/warp_specialize/example_warp_specialize_gemm_copy_1_gemm_0.py index faaf48c6485fce0c1e04cf4e38de1deb2a244877..02d88019c7e1793c824e088f54d8cd3b3d871212 100644 --- a/examples/warp_specialize/example_warp_specialize_gemm_copy_1_gemm_0.py +++ b/examples/warp_specialize/example_warp_specialize_gemm_copy_1_gemm_0.py @@ -5,20 +5,12 @@ import tilelang.language as T # add decorator @tilelang.jit if you want to return a torch function # @tilelang.jit @tilelang.jit(out_idx=[2]) -def matmul_warp_specialize_copy_1_gemm_0(M, - N, - K, - block_M, - block_N, - block_K, - dtype="float16", - accum_dtype="float"): - +def matmul_warp_specialize_copy_1_gemm_0(M, N, K, block_M, block_N, block_K, dtype=T.float16, accum_dtype=T.float32): @T.prim_func def main( - A: T.Tensor((M, K), dtype), - B: T.Tensor((K, N), dtype), - C: T.Tensor((M, N), dtype), + A: T.Tensor((M, K), dtype), + B: T.Tensor((K, N), dtype), + C: T.Tensor((M, N), dtype), ): # Initialize Kernel Context with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=256) as (bx, by): diff --git a/examples/warp_specialize/example_warp_specialize_gemm_copy_gemm_0_1.py b/examples/warp_specialize/example_warp_specialize_gemm_copy_gemm_0_1.py index 
c91274540f308d5b58badde74b14d0178d31df04..5468aa6eace4ca259e885db9bd33d9e6a77b459a 100644 --- a/examples/warp_specialize/example_warp_specialize_gemm_copy_gemm_0_1.py +++ b/examples/warp_specialize/example_warp_specialize_gemm_copy_gemm_0_1.py @@ -5,26 +5,20 @@ import tilelang.language as T # add decorator @tilelang.jit if you want to return a torch function # @tilelang.jit @tilelang.jit( - out_idx=[2], pass_configs={ + out_idx=[2], + pass_configs={ tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, - }) -def matmul_warp_specialize_copy_1_gemm_0(M, - N, - K, - block_M, - block_N, - block_K, - dtype="float16", - accum_dtype="float"): - + }, +) +def matmul_warp_specialize_copy_1_gemm_0(M, N, K, block_M, block_N, block_K, dtype=T.float16, accum_dtype=T.float32): warp_group_num = 2 threads = 128 * warp_group_num @T.prim_func def main( - A: T.Tensor((M, K), dtype), - B: T.Tensor((K, N), dtype), - C: T.Tensor((M, N), dtype), + A: T.Tensor((M, K), dtype), + B: T.Tensor((K, N), dtype), + C: T.Tensor((M, N), dtype), ): # Initialize Kernel Context with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): diff --git a/examples/warp_specialize/example_warp_specialize_gemm_softpipe_stage2.py b/examples/warp_specialize/example_warp_specialize_gemm_softpipe_stage2.py index 3b1d867198b673916b2b67cd0e9a7508d6ad369c..31d156f327a6ccde2c6d1ac236b4622e60048dfe 100644 --- a/examples/warp_specialize/example_warp_specialize_gemm_softpipe_stage2.py +++ b/examples/warp_specialize/example_warp_specialize_gemm_softpipe_stage2.py @@ -5,8 +5,7 @@ import tilelang.language as T # add decorator @tilelang.jit if you want to return a torch function # @tilelang.jit @tilelang.jit(out_idx=[2]) -def matmul(M, N, K, block_M, block_N, block_K, dtype="float16", accum_dtype="float"): - +def matmul(M, N, K, block_M, block_N, block_K, dtype=T.float16, accum_dtype=T.float32): @T.prim_func def main( A: T.Tensor[(M, K), dtype], diff --git a/format.sh b/format.sh index e820b5886eb1d7f8182b734a3d05f0bb24578081..3cc4390dbe2a3b33e928c1c52f546dbf2cdc21bf 100755 --- a/format.sh +++ b/format.sh @@ -9,7 +9,7 @@ # bash format.sh --all # # -# YAPF + Clang formatter (if installed). This script formats all changed files from the last mergebase. +# Ruff (format) + Clang formatter (if installed). This script formats all changed files from the last mergebase. # You are encouraged to run this locally before pushing changes for review. 
# Cause the script to exit if a single command fails diff --git a/maint/gemm_v2/correctness_evaluation.py b/maint/gemm_v2/correctness_evaluation.py index 33a58129600848439c627552fcbc8212bbdbd30d..44441cdeb7978a2d441c08df6ee843970b3ab840 100644 --- a/maint/gemm_v2/correctness_evaluation.py +++ b/maint/gemm_v2/correctness_evaluation.py @@ -2,6 +2,8 @@ import pytest from tilelang import tvm as tvm import tilelang.testing +from tilelang import language as T +import torch def matmul( @@ -24,13 +26,11 @@ def matmul( A_shared_shape = (block_K, block_M) if trans_A else (block_M, block_K) B_shared_shape = (block_N, block_K) if trans_B else (block_K, block_N) - import tilelang.language as T - @T.prim_func def main( - A: T.Tensor(A_shape, in_dtype), - B: T.Tensor(B_shape, in_dtype), - C: T.Tensor((M, N), out_dtype), + A: T.Tensor(A_shape, in_dtype), + B: T.Tensor(B_shape, in_dtype), + C: T.Tensor((M, N), out_dtype), ): with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): A_shared = T.alloc_shared(A_shared_shape, in_dtype, scope="shared.dyn") @@ -66,20 +66,19 @@ def _compile_and_check( tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, # tilelang.PassConfigKey.TIR_USE_ASYNC_COPY: False, - }) + }, + ) print(kernel.get_kernel_source()) profiler = kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Normal) def ref_program(A, B): - import torch - if trans_A: A = A.T if trans_B: B = B.T - if in_dtype == "float32": + if in_dtype == T.float32: A = (A.view(torch.int32) - 0x1000).view(torch.float32) B = (B.view(torch.int32) - 0x1000).view(torch.float32) C = torch.matmul(A.to(torch.float), B.to(torch.float)) @@ -147,13 +146,11 @@ def matmul_rs( B_shared_shape = (block_N, block_K) if trans_B else (block_K, block_N) A_frag_shape = A_shared_shape - import tilelang.language as T - @T.prim_func def main( - A: T.Tensor(A_shape, in_dtype), - B: T.Tensor(B_shape, in_dtype), - C: T.Tensor((M, N), out_dtype), + A: T.Tensor(A_shape, in_dtype), + B: T.Tensor(B_shape, in_dtype), + C: T.Tensor((M, N), out_dtype), ): with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): A_shared = T.alloc_shared(A_shared_shape, in_dtype, scope="shared.dyn") @@ -234,13 +231,11 @@ def matmul_sr( B_shared_shape = (block_N, block_K) if trans_B else (block_K, block_N) B_frag_shape = B_shared_shape - import tilelang.language as T - @T.prim_func def main( - A: T.Tensor(A_shape, in_dtype), - B: T.Tensor(B_shape, in_dtype), - C: T.Tensor((M, N), out_dtype), + A: T.Tensor(A_shape, in_dtype), + B: T.Tensor(B_shape, in_dtype), + C: T.Tensor((M, N), out_dtype), ): with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): A_shared = T.alloc_shared(A_shared_shape, in_dtype, scope="shared.dyn") @@ -322,13 +317,11 @@ def matmul_rr( A_frag_shape = A_shared_shape B_frag_shape = B_shared_shape - import tilelang.language as T - @T.prim_func def main( - A: T.Tensor(A_shape, in_dtype), - B: T.Tensor(B_shape, in_dtype), - C: T.Tensor((M, N), out_dtype), + A: T.Tensor(A_shape, in_dtype), + B: T.Tensor(B_shape, in_dtype), + C: T.Tensor((M, N), out_dtype), ): with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): A_shared = T.alloc_shared(A_shared_shape, in_dtype, scope="shared.dyn") @@ -394,37 +387,48 @@ M_VALUES = [64, 128, 256] N_VALUES = [16, 32, 64, 128, 256, 512] K_VALUES = [16, 32, 64, 128] K_VALUES_8Bit = [32, 64, 128] -FALSE_TRUE_CASES = ([ - 
pytest.param( - k, - "float16", - "float16", - "float16", - id=f"K{k}-float16-float16-float16", - ) for k in K_VALUES -] + [pytest.param( - k, - "int8", - "int32", - "int32", - id="K32-int8-int32-int32", -) for k in K_VALUES_8Bit] + [ - pytest.param( - k, - "float8_e5m2", - "float32", - "float32", - id="K32-float8_e5m2-float32-float32", - ) for k in K_VALUES_8Bit -] + [ - pytest.param( - k, - "float8_e4m3", - "float32", - "float32", - id="K32-float8_e4m3-float32-float32", - ) for k in K_VALUES_8Bit -]) +FALSE_TRUE_CASES = ( + [ + pytest.param( + k, + T.float16, + T.float16, + T.float16, + id=f"K{k}-float16-float16-float16", + ) + for k in K_VALUES + ] + + [ + pytest.param( + k, + T.int8, + T.int32, + T.int32, + id="K32-int8-int32-int32", + ) + for k in K_VALUES_8Bit + ] + + [ + pytest.param( + k, + T.float8_e5m2, + T.float32, + T.float32, + id="K32-float8_e5m2-float32-float32", + ) + for k in K_VALUES_8Bit + ] + + [ + pytest.param( + k, + T.float8_e4m3fn, + T.float32, + T.float32, + id="K32-float8_e4m3-float32-float32", + ) + for k in K_VALUES_8Bit + ] +) def _ensure_torch_dtypes(*dtype_names): @@ -440,15 +444,15 @@ def run_gemm_rs_false_true(m, n, k, in_dtype, out_dtype, accum_dtype): def run_gemm_rs_false_false(m, n, k): - run_gemm_rs(m, n, k * 3, False, False, "float16", "float16", "float16", m, n, k) + run_gemm_rs(m, n, k * 3, False, False, T.float16, T.float16, T.float16, m, n, k) def run_gemm_rs_true_false(m, n, k): - run_gemm_rs(m, n, k * 3, True, False, "float16", "float16", "float16", m, n, k) + run_gemm_rs(m, n, k * 3, True, False, T.float16, T.float16, T.float16, m, n, k) def run_gemm_rs_true_true(m, n, k): - run_gemm_rs(m, n, k * 3, True, True, "float16", "float16", "float16", m, n, k) + run_gemm_rs(m, n, k * 3, True, True, T.float16, T.float16, T.float16, m, n, k) def run_gemm_sr_false_true(m, n, k, in_dtype, out_dtype, accum_dtype): @@ -456,15 +460,15 @@ def run_gemm_sr_false_true(m, n, k, in_dtype, out_dtype, accum_dtype): def run_gemm_sr_false_false(m, n, k): - run_gemm_sr(m, n, k * 3, False, False, "float16", "float16", "float16", m, n, k) + run_gemm_sr(m, n, k * 3, False, False, T.float16, T.float16, T.float16, m, n, k) def run_gemm_sr_true_false(m, n, k): - run_gemm_sr(m, n, k * 3, True, False, "float16", "float16", "float16", m, n, k) + run_gemm_sr(m, n, k * 3, True, False, T.float16, T.float16, T.float16, m, n, k) def run_gemm_sr_true_true(m, n, k): - run_gemm_sr(m, n, k * 3, True, True, "float16", "float16", "float16", m, n, k) + run_gemm_sr(m, n, k * 3, True, True, T.float16, T.float16, T.float16, m, n, k) def run_gemm_rr_false_true(m, n, k, in_dtype, out_dtype, accum_dtype): @@ -472,15 +476,15 @@ def run_gemm_rr_false_true(m, n, k, in_dtype, out_dtype, accum_dtype): def run_gemm_rr_false_false(m, n, k): - run_gemm_rr(m, n, k * 3, False, False, "float16", "float16", "float16", m, n, k) + run_gemm_rr(m, n, k * 3, False, False, T.float16, T.float16, T.float16, m, n, k) def run_gemm_rr_true_false(m, n, k): - run_gemm_rr(m, n, k * 3, True, False, "float16", "float16", "float16", m, n, k) + run_gemm_rr(m, n, k * 3, True, False, T.float16, T.float16, T.float16, m, n, k) def run_gemm_rr_true_true(m, n, k): - run_gemm_rr(m, n, k * 3, True, True, "float16", "float16", "float16", m, n, k) + run_gemm_rr(m, n, k * 3, True, True, T.float16, T.float16, T.float16, m, n, k) TRANS_CASES = [ @@ -536,9 +540,9 @@ def test_gemm_false_false(m, n, k): k * 3, False, False, - "float16", - "float16", - "float16", + T.float16, + T.float16, + T.float16, m, n, k, @@ -555,9 +559,9 @@ def 
test_gemm_true_false(m, n, k): k * 3, True, False, - "float16", - "float16", - "float16", + T.float16, + T.float16, + T.float16, m, n, k, @@ -574,9 +578,9 @@ def test_gemm_true_true(m, n, k): k * 3, True, True, - "float16", - "float16", - "float16", + T.float16, + T.float16, + T.float16, m, n, k, @@ -595,7 +599,7 @@ def test_gemm_rs_false_true(m, n, k, in_dtype, out_dtype, accum_dtype): @pytest.mark.parametrize("n", N_VALUES, ids=lambda v: f"N{v}") @pytest.mark.parametrize("k", K_VALUES, ids=lambda v: f"K{v}") def test_gemm_rs_false_false(m, n, k): - _ensure_torch_dtypes("float16") + _ensure_torch_dtypes(T.float16) run_gemm_rs_false_false(m, n, k) @@ -603,7 +607,7 @@ def test_gemm_rs_false_false(m, n, k): @pytest.mark.parametrize("n", N_VALUES, ids=lambda v: f"N{v}") @pytest.mark.parametrize("k", K_VALUES, ids=lambda v: f"K{v}") def test_gemm_rs_true_false(m, n, k): - _ensure_torch_dtypes("float16") + _ensure_torch_dtypes(T.float16) run_gemm_rs_true_false(m, n, k) @@ -611,7 +615,7 @@ def test_gemm_rs_true_false(m, n, k): @pytest.mark.parametrize("n", N_VALUES, ids=lambda v: f"N{v}") @pytest.mark.parametrize("k", K_VALUES, ids=lambda v: f"K{v}") def test_gemm_rs_true_true(m, n, k): - _ensure_torch_dtypes("float16") + _ensure_torch_dtypes(T.float16) run_gemm_rs_true_true(m, n, k) @@ -627,7 +631,7 @@ def test_gemm_sr_false_true(m, n, k, in_dtype, out_dtype, accum_dtype): @pytest.mark.parametrize("n", N_VALUES, ids=lambda v: f"N{v}") @pytest.mark.parametrize("k", K_VALUES, ids=lambda v: f"K{v}") def test_gemm_sr_false_false(m, n, k): - _ensure_torch_dtypes("float16") + _ensure_torch_dtypes(T.float16) run_gemm_sr_false_false(m, n, k) @@ -635,7 +639,7 @@ def test_gemm_sr_false_false(m, n, k): @pytest.mark.parametrize("n", N_VALUES, ids=lambda v: f"N{v}") @pytest.mark.parametrize("k", K_VALUES, ids=lambda v: f"K{v}") def test_gemm_sr_true_false(m, n, k): - _ensure_torch_dtypes("float16") + _ensure_torch_dtypes(T.float16) run_gemm_sr_true_false(m, n, k) @@ -643,7 +647,7 @@ def test_gemm_sr_true_false(m, n, k): @pytest.mark.parametrize("n", N_VALUES, ids=lambda v: f"N{v}") @pytest.mark.parametrize("k", K_VALUES, ids=lambda v: f"K{v}") def test_gemm_sr_true_true(m, n, k): - _ensure_torch_dtypes("float16") + _ensure_torch_dtypes(T.float16) run_gemm_sr_true_true(m, n, k) @@ -659,7 +663,7 @@ def test_gemm_rr_false_true(m, n, k, in_dtype, out_dtype, accum_dtype): @pytest.mark.parametrize("n", N_VALUES, ids=lambda v: f"N{v}") @pytest.mark.parametrize("k", K_VALUES, ids=lambda v: f"K{v}") def test_gemm_rr_false_false(m, n, k): - _ensure_torch_dtypes("float16") + _ensure_torch_dtypes(T.float16) run_gemm_rr_false_false(m, n, k) @@ -667,7 +671,7 @@ def test_gemm_rr_false_false(m, n, k): @pytest.mark.parametrize("n", N_VALUES, ids=lambda v: f"N{v}") @pytest.mark.parametrize("k", K_VALUES, ids=lambda v: f"K{v}") def test_gemm_rr_true_false(m, n, k): - _ensure_torch_dtypes("float16") + _ensure_torch_dtypes(T.float16) run_gemm_rr_true_false(m, n, k) @@ -675,7 +679,7 @@ def test_gemm_rr_true_false(m, n, k): @pytest.mark.parametrize("n", N_VALUES, ids=lambda v: f"N{v}") @pytest.mark.parametrize("k", K_VALUES, ids=lambda v: f"K{v}") def test_gemm_rr_true_true(m, n, k): - _ensure_torch_dtypes("float16") + _ensure_torch_dtypes(T.float16) run_gemm_rr_true_true(m, n, k) @@ -687,7 +691,7 @@ if __name__ == "__main__": # for n in [16, 32, 64, 128]: # for k in [16, 32, 64, 128]: # print(f"======================= Test {m} {n} {k} False True =============================") - # run_gemm(m, n, k * 3, False, True, "float16", 
"float16", "float16", m, n, k, 2, 128) + # run_gemm(m, n, k * 3, False, True, T.float16, T.float16, T.float16, m, n, k, 2, 128) # print(f"Test {m} {n} {k} Pass") # # Test Pass @@ -695,7 +699,7 @@ if __name__ == "__main__": # for n in [16, 32, 64, 128]: # for k in [16, 32, 64, 128]: # print(f"======================= Test {m} {n} {k} False False =============================") - # run_gemm(m, n, k * 3, False, False, "float16", "float16", "float16", m, n, k, 2, 128) + # run_gemm(m, n, k * 3, False, False, T.float16, T.float16, T.float16, m, n, k, 2, 128) # print(f"Test {m} {n} {k} Pass") # # Test Pass @@ -703,7 +707,7 @@ if __name__ == "__main__": # for n in [16, 32, 64, 128]: # for k in [16, 32, 64, 128]: # print(f"======================= Test {m} {n} {k} True False =============================") - # run_gemm(m, n, k * 3, True, False, "float16", "float16", "float16", m, n, k, 2, 128) + # run_gemm(m, n, k * 3, True, False, T.float16, T.float16, T.float16, m, n, k, 2, 128) # print(f"Test {m}, {n} {k} Pass") # print(f"Test {n} Pass") @@ -712,7 +716,7 @@ if __name__ == "__main__": # for n in [16, 32, 64, 128]: # for k in [16, 32, 64, 128]: # print(f"======================= Test {m} {n} {k} True True =============================") - # run_gemm(m, n, k * 3, True, True, "float16", "float16", "float16", m, n, k, 2, 128) + # run_gemm(m, n, k * 3, True, True, T.float16, T.float16, T.float16, m, n, k, 2, 128) # print(f"Test {m}, {n} {k} Pass") # print(f"Test {n} Pass") @@ -721,15 +725,15 @@ if __name__ == "__main__": # for n in [16, 32, 64, 128]: # for k in [16, 32, 64, 128]: # print(f"======================= Test {m} {n} {k} False True =============================") - # run_gemm_rs(m, n, k * 3, False, True, "float16", "float16", "float16", m, n, k, 2, 128) + # run_gemm_rs(m, n, k * 3, False, True, T.float16, T.float16, T.float16, m, n, k, 2, 128) # print(f"Test {m} {n} {k} Pass") # for n in [16, 32, 64, 128]: # for k in [16, 32, 64, 128]: - # run_gemm_rs(64, n, k, False, False, "float16", "float16", "float16", 64, n, k, 0, 256) + # run_gemm_rs(64, n, k, False, False, T.float16, T.float16, T.float16, 64, n, k, 0, 256) # print(f"Test {64} {n} {k} Pass") # for n in [16, 32, 64, 128]: # for k in [16, 32, 64, 128]: - # run_gemm(64, n, k, False, False, "float16", "float16", "float16", 64, n, k, 0, 256) + # run_gemm(64, n, k, False, False, T.float16, T.float16, T.float16, 64, n, k, 0, 256) # print(f"Test {64} {n} {k} Pass") diff --git a/maint/gemm_v2/correctness_evaluation_sm70.py b/maint/gemm_v2/correctness_evaluation_sm70.py index 128f4abce56aa4f9e76270702eec3c43ee181210..606d10261100e617cdd3360823d41aa16261004e 100644 --- a/maint/gemm_v2/correctness_evaluation_sm70.py +++ b/maint/gemm_v2/correctness_evaluation_sm70.py @@ -2,6 +2,7 @@ import pytest from tilelang import tvm as tvm import tilelang.testing +from tilelang import language as T def matmul( @@ -24,13 +25,11 @@ def matmul( A_shared_shape = (block_K, block_M) if trans_A else (block_M, block_K) B_shared_shape = (block_N, block_K) if trans_B else (block_K, block_N) - import tilelang.language as T - @T.prim_func def main( - A: T.Tensor(A_shape, in_dtype), - B: T.Tensor(B_shape, in_dtype), - C: T.Tensor((M, N), out_dtype), + A: T.Tensor(A_shape, in_dtype), + B: T.Tensor(B_shape, in_dtype), + C: T.Tensor((M, N), out_dtype), ): with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): A_shared = T.alloc_shared(A_shared_shape, in_dtype, scope="shared.dyn") @@ -67,7 +66,8 @@ def _compile_and_check( 
tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, # tilelang.PassConfigKey.TIR_USE_ASYNC_COPY: False, - }) + }, + ) print(kernel.get_kernel_source()) @@ -80,7 +80,7 @@ def _compile_and_check( A = A.T if trans_B: B = B.T - if in_dtype == "float32": + if in_dtype == T.float32: A = (A.view(torch.int32) - 0x1000).view(torch.float32) B = (B.view(torch.int32) - 0x1000).view(torch.float32) C = torch.matmul(A.to(torch.float), B.to(torch.float)) @@ -146,13 +146,11 @@ def matmul_rs( B_shared_shape = (block_N, block_K) if trans_B else (block_K, block_N) A_frag_shape = A_shared_shape - import tilelang.language as T - @T.prim_func def main( - A: T.Tensor(A_shape, in_dtype), - B: T.Tensor(B_shape, in_dtype), - C: T.Tensor((M, N), out_dtype), + A: T.Tensor(A_shape, in_dtype), + B: T.Tensor(B_shape, in_dtype), + C: T.Tensor((M, N), out_dtype), ): with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): A_shared = T.alloc_shared(A_shared_shape, in_dtype, scope="shared.dyn") @@ -213,23 +211,25 @@ def run_gemm_rs( M_VALUES = [64, 128] N_VALUES = [32, 64, 128] K_VALUES = [16, 32, 64] -FALSE_TRUE_CASES = ([ +FALSE_TRUE_CASES = [ pytest.param( k, - "float16", - "float16", - "float16", + T.float16, + T.float16, + T.float16, id=f"K{k}-float16-float16-float16", - ) for k in K_VALUES + ) + for k in K_VALUES ] + [ pytest.param( k, - "float16", - "float16", - "float32", + T.float16, + T.float16, + T.float32, id=f"K{k}-float16-float16-float32", - ) for k in K_VALUES -]) + ) + for k in K_VALUES +] def _ensure_torch_dtypes(*dtype_names): @@ -245,7 +245,7 @@ def run_gemm_rs_false_true(m, n, k, in_dtype, out_dtype, accum_dtype): def run_gemm_rs_false_false(m, n, k): - run_gemm_rs(m, n, k * 3, False, False, "float16", "float16", "float16", m, n, k, 2, 128) + run_gemm_rs(m, n, k * 3, False, False, T.float16, T.float16, T.float16, m, n, k, 2, 128) TRANS_CASES = [ @@ -303,9 +303,9 @@ def test_gemm_false_false(m, n, k): k * 3, False, False, - "float16", - "float16", - "float16", + T.float16, + T.float16, + T.float16, m, n, k, @@ -326,7 +326,7 @@ def test_gemm_rs_false_true(m, n, k, in_dtype, out_dtype, accum_dtype): @pytest.mark.parametrize("n", N_VALUES, ids=lambda v: f"N{v}") @pytest.mark.parametrize("k", K_VALUES, ids=lambda v: f"K{v}") def test_gemm_rs_false_false(m, n, k): - _ensure_torch_dtypes("float16") + _ensure_torch_dtypes(T.float16) run_gemm_rs_false_false(m, n, k) @@ -338,7 +338,7 @@ if __name__ == "__main__": # for n in [16, 32, 64, 128]: # for k in [16, 32, 64]: # print(f"======================= Test {m} {n} {k} False True =============================") - # run_gemm(m, n, k * 3, False, True, "float16", "float16", "float16", m, n, k, 2, 128) + # run_gemm(m, n, k * 3, False, True, T.float16, T.float16, T.float16, m, n, k, 2, 128) # print(f"Test {m} {n} {k} Pass") # # Test Pass @@ -346,5 +346,5 @@ if __name__ == "__main__": # for n in [16, 32, 64, 128]: # for k in [16, 32, 64]: # print(f"======================= Test {m} {n} {k} False False =============================") - # run_gemm(m, n, k * 3, False, False, "float16", "float16", "float16", m, n, k, 2, 128) + # run_gemm(m, n, k * 3, False, False, T.float16, T.float16, T.float16, m, n, k, 2, 128) # print(f"Test {m} {n} {k} Pass") diff --git a/maint/gemm_v2/correctness_evaluation_tcgen05.py b/maint/gemm_v2/correctness_evaluation_tcgen05.py index f5d7658903d548517f9c189f68940a8e08ab3d90..8d9728182b2606b6398e470639c1e049c0bd7ec6 100644 --- 
a/maint/gemm_v2/correctness_evaluation_tcgen05.py +++ b/maint/gemm_v2/correctness_evaluation_tcgen05.py @@ -27,9 +27,9 @@ def matmul( @T.prim_func def main( - A: T.Tensor(A_shape, in_dtype), - B: T.Tensor(B_shape, in_dtype), - C: T.Tensor((M, N), out_dtype), + A: T.Tensor(A_shape, in_dtype), + B: T.Tensor(B_shape, in_dtype), + C: T.Tensor((M, N), out_dtype), ): with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): A_shared = T.alloc_shared(A_shared_shape, in_dtype) @@ -42,15 +42,7 @@ def matmul( for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages): T.copy(A[by * block_M, k * block_K], A_shared) T.copy(B[bx * block_N, k * block_K], B_shared) - T.gemm( - A_shared, - B_shared, - C_tmem, - trans_A, - trans_B, - mbar=mbar, - wg_wait=-1, - clear_accum=k == 0) + T.gemm(A_shared, B_shared, C_tmem, trans_A, trans_B, mbar=mbar, wg_wait=-1, clear_accum=k == 0) T.mbarrier_wait_parity(mbar, k % 2) T.copy(C_tmem, C_local) @@ -74,7 +66,8 @@ def _compile_and_check( pass_configs={ tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, - }) + }, + ) print(kernel.get_kernel_source()) @@ -87,7 +80,7 @@ def _compile_and_check( A = A.T if trans_B: B = B.T - if in_dtype == "float32": + if in_dtype == T.float32: A = (A.view(torch.int32) - 0x1000).view(torch.float32) B = (B.view(torch.int32) - 0x1000).view(torch.float32) C = torch.matmul(A.to(torch.float), B.to(torch.float)) @@ -138,23 +131,25 @@ M_VALUES = [32, 64, 128, 256] N_VALUES = [64, 128, 256, 512] K_VALUES = [16, 32, 64, 128] K_VALUES_8Bit = [32, 64, 128] -FALSE_TRUE_CASES = ([ +FALSE_TRUE_CASES = [ pytest.param( k, - "float16", - "float32", - "float32", + T.float16, + T.float32, + T.float32, id=f"K{k}-float16-float-float", - ) for k in K_VALUES + ) + for k in K_VALUES ] + [ pytest.param( k, - "float8_e5m2", - "float32", - "float32", + T.float8_e5m2, + T.float32, + T.float32, id="K32-float8_e5m2-float32-float32", - ) for k in K_VALUES_8Bit -]) + ) + for k in K_VALUES_8Bit +] TRANS_CASES = [ pytest.param(False, True, id="nt"), @@ -191,7 +186,7 @@ def test_gemm_false_true(m, n, k, in_dtype, out_dtype, accum_dtype): if __name__ == "__main__": - # tilelang.testing.main() + tilelang.testing.main() # # Test Pass # for m in [32, 64, 128, 256]: @@ -200,27 +195,24 @@ if __name__ == "__main__": # if m in [32, 64] and (n not in [64, 128, 256]): # continue # print(f"======================= Test {m} {n} {k} False True =============================") - # run_gemm(m, n, k * 3, False, True, "float16", "float", "float", m, n, k, 2, 128) + # run_gemm(m, n, k * 3, False, True, T.float16, T.float, T.float, m, n, k, 2, 128) # print(f"Test {m} {n} {k} Pass") # # Test Pass # for m in [32, 64, 128, 256]: - # for n in [16, 32, 64, 128]: - # for k in [32, 64, 128]: + # for n in [32, 64, 128]: + # for k in [16, 32, 64, 128]: # if m in [32, 64] and (n not in [64, 128, 256]): # continue # print(f"======================= Test {m} {n} {k} False True =============================") - # run_gemm(m, n, k * 3, False, True, "float8_e5m2", "float", "float", m, n, k, 2, 128) + # run_gemm(m, n, k * 3, False, True, T.float16, T.float, T.float, m, n, k, 2, 256) # print(f"Test {m} {n} {k} Pass") - tilelang.disable_cache() - run_gemm(32, 512, 16, False, True, "float16", "float32", "float32", 32, 512, 16, 0, 128) - run_gemm(32, 512, 32, False, True, "float16", "float32", "float32", 32, 512, 32, 0, 128) - run_gemm(32, 512, 64, False, True, "float16", "float32", "float32", 32, 512, 64, 0, 128) - 
run_gemm(64, 512, 16, False, True, "float16", "float32", "float32", 64, 512, 16, 0, 128) - run_gemm(64, 512, 16, False, True, "float16", "float32", "float32", 32, 512, 16, 0, 128) - run_gemm(128, 512, 16, False, True, "float16", "float32", "float32", 128, 512, 16, 0, 128) - - # run_gemm(64, 512, 32, False, True, "float16", "float32", "float32", 64, 512, 32, 0, 128) - # run_gemm(64, 512, 64, False, True, "float16", "float32", "float32", 64, 512, 64, 0, 128) - # run_gemm(128, 512, 16, False, True, "float16", "float32", "float32", 128, 512, 16, 0, 128) + # # Test Pass + # for m in [32, 64, 128, 256]: + # for n in [16, 32, 64, 128]: + # for k in [32, 64, 128]: + # if m in [32, 64] and (n not in [64, 128, 256]): + # continue + # print(f"======================= Test {m} {n} {k} False True =============================") + # run_gemm(m, n, k * 3, False, True, T.float8_e5m2, T.float, T.float, m, n, k, 2, 128) diff --git a/maint/gemm_v2/latency.py b/maint/gemm_v2/latency.py index 07a502017da13161c8dc98a2fe7973f0f979c1ef..b7b2a2af95971456aec80ebab4117fd7cff81edd 100644 --- a/maint/gemm_v2/latency.py +++ b/maint/gemm_v2/latency.py @@ -13,13 +13,12 @@ use_v2 = args.use_v2 # target currently can be "cuda" or "hip" or "cpu". # if not specified, it will be inferred from the input tensors during compile time @tilelang.jit -def matmul(M, N, K, block_M, block_N, block_K, dtype="float16", accum_dtype="float"): - +def matmul(M, N, K, block_M, block_N, block_K, dtype=T.float16, accum_dtype=T.float32): @T.prim_func def matmul_relu_kernel( - A: T.Tensor((M, K), dtype), - B: T.Tensor((K, N), dtype), - C: T.Tensor((M, N), dtype), + A: T.Tensor((M, K), dtype), + B: T.Tensor((K, N), dtype), + C: T.Tensor((M, N), dtype), ): # Initialize Kernel Context with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=128) as (bx, by): diff --git a/maint/gemm_v2/latency_gemm.py b/maint/gemm_v2/latency_gemm.py index 13392dec7f2d1510f6696c027fbd9ab554d1a7e4..5f0450e0230e18c053b9114642a7931b616926ba 100644 --- a/maint/gemm_v2/latency_gemm.py +++ b/maint/gemm_v2/latency_gemm.py @@ -13,13 +13,12 @@ use_v2 = args.use_v2 # target currently can be "cuda" or "hip" or "cpu". 
# if not specified, it will be inferred from the input tensors during compile time @tilelang.jit -def matmul(M, N, K, block_M, block_N, block_K, dtype="float16", accum_dtype="float"): - +def matmul(M, N, K, block_M, block_N, block_K, dtype=T.float16, accum_dtype=T.float32): @T.prim_func def matmul_relu_kernel( - A: T.Tensor((M, K), dtype), - B: T.Tensor((K, N), dtype), - C: T.Tensor((M, N), dtype), + A: T.Tensor((M, K), dtype), + B: T.Tensor((K, N), dtype), + C: T.Tensor((M, N), dtype), ): # Initialize Kernel Context with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=128) as (bx, by): diff --git a/maint/gemm_v2/latency_mha_fwd_bhsd.py b/maint/gemm_v2/latency_mha_fwd_bhsd.py index 4126bb9d3c84e2141309a8d0f3efea533a9515c9..7a83d7cec8ad840e09dc2ae5c6da6ef156760fb7 100644 --- a/maint/gemm_v2/latency_mha_fwd_bhsd.py +++ b/maint/gemm_v2/latency_mha_fwd_bhsd.py @@ -8,13 +8,13 @@ import argparse from functools import partial parser = argparse.ArgumentParser() -parser.add_argument('--batch', type=int, default=128, help='batch size') -parser.add_argument('--heads', type=int, default=16, help='heads') -parser.add_argument('--seq_q', type=int, default=1024, help='query sequence length') -parser.add_argument('--seq_kv', type=int, default=1024, help='key/value sequence length') -parser.add_argument('--dim', type=int, default=256, help='dim') -parser.add_argument('--is_causal', action='store_true', help='causal') -parser.add_argument('--tune', action='store_true', help='tune configs') +parser.add_argument("--batch", type=int, default=128, help="batch size") +parser.add_argument("--heads", type=int, default=16, help="heads") +parser.add_argument("--seq_q", type=int, default=1024, help="query sequence length") +parser.add_argument("--seq_kv", type=int, default=1024, help="key/value sequence length") +parser.add_argument("--dim", type=int, default=256, help="dim") +parser.add_argument("--is_causal", action="store_true", help="causal") +parser.add_argument("--tune", action="store_true", help="tune configs") parser.add_argument("--use_v2", action="store_true") args = parser.parse_args() @@ -29,24 +29,17 @@ def get_configs(): @autotune(configs=get_configs(), warmup=10, rep=10) @tilelang.jit( - out_idx=[3], pass_configs={ + out_idx=[3], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) -def flashattn(batch, - heads, - seq_q, - seq_kv, - dim, - is_causal, - block_M=64, - block_N=64, - num_stages=0, - threads=128): - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) + }, +) +def flashattn(batch, heads, seq_q, seq_kv, dim, is_causal, block_M=64, block_N=64, num_stages=0, threads=128): + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) q_shape = [batch, heads, seq_q, dim] kv_shape = [batch, heads, seq_kv, dim] - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 past_len = seq_kv - seq_q assert past_len >= 0, "seq_kv must be greater than or equal to seq_q" @@ -62,7 +55,7 @@ def flashattn(batch, by: T.int32, bz: T.int32, ): - T.copy(K[bz, by, k * block_N:(k + 1) * block_N, :], K_shared) + T.copy(K[bz, by, k * block_N : (k + 1) * block_N, :], K_shared) if is_causal: for i, j in T.Parallel(block_M, block_N): q_idx = bx * block_M + i + past_len @@ -85,7 +78,7 @@ def flashattn(batch, by: T.int32, bz: T.int32, ): - T.copy(V[bz, by, k * block_N:(k + 1) * block_N, :], V_shared) + T.copy(V[bz, by, k * block_N : (k + 1) * block_N, :], V_shared) # T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) if use_v2: T.gemm_v2(acc_s_cast, 
V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) @@ -94,13 +87,13 @@ def flashattn(batch, @T.macro def Softmax( - acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype), - acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype), - scores_max: T.FragmentBuffer([block_M], accum_dtype), - scores_max_prev: T.FragmentBuffer([block_M], accum_dtype), - scores_scale: T.FragmentBuffer([block_M], accum_dtype), - scores_sum: T.FragmentBuffer([block_M], accum_dtype), - logsum: T.FragmentBuffer([block_M], accum_dtype), + acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype), + acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype), + scores_max: T.FragmentBuffer([block_M], accum_dtype), + scores_max_prev: T.FragmentBuffer([block_M], accum_dtype), + scores_scale: T.FragmentBuffer([block_M], accum_dtype), + scores_sum: T.FragmentBuffer([block_M], accum_dtype), + logsum: T.FragmentBuffer([block_M], accum_dtype), ): T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) @@ -125,18 +118,18 @@ def flashattn(batch, @T.macro def Rescale( - acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), - scores_scale: T.FragmentBuffer([block_M], accum_dtype), + acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), + scores_scale: T.FragmentBuffer([block_M], accum_dtype), ): for i, j in T.Parallel(block_M, dim): acc_o[i, j] *= scores_scale[i] @T.prim_func def main( - Q: T.Tensor(q_shape, dtype), - K: T.Tensor(kv_shape, dtype), - V: T.Tensor(kv_shape, dtype), - Output: T.Tensor(q_shape, dtype), + Q: T.Tensor(q_shape, dtype), + K: T.Tensor(kv_shape, dtype), + V: T.Tensor(kv_shape, dtype), + Output: T.Tensor(q_shape, dtype), ): with T.Kernel(T.ceildiv(seq_q, block_M), heads, batch, threads=threads) as (bx, by, bz): Q_shared = T.alloc_shared([block_M, dim], dtype) @@ -152,43 +145,42 @@ def flashattn(batch, scores_sum = T.alloc_fragment([block_M], accum_dtype) logsum = T.alloc_fragment([block_M], accum_dtype) - T.copy(Q[bz, by, bx * block_M:(bx + 1) * block_M, :], Q_shared) + T.copy(Q[bz, by, bx * block_M : (bx + 1) * block_M, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) loop_range = ( - T.min( - T.ceildiv(seq_kv, block_N), T.ceildiv( - (bx + 1) * block_M + - past_len, block_N)) if is_causal else T.ceildiv(seq_kv, block_N)) + T.min(T.ceildiv(seq_kv, block_N), T.ceildiv((bx + 1) * block_M + past_len, block_N)) + if is_causal + else T.ceildiv(seq_kv, block_N) + ) for k in T.Pipelined(loop_range, num_stages=num_stages): MMA0(K, Q_shared, K_shared, acc_s, k, bx, by, bz) - Softmax(acc_s, acc_s_cast, scores_max, scores_max_prev, scores_scale, scores_sum, - logsum) + Softmax(acc_s, acc_s_cast, scores_max, scores_max_prev, scores_scale, scores_sum, logsum) Rescale(acc_o, scores_scale) MMA1(V, V_shared, acc_s_cast, acc_o, k, by, bz) for i, j in T.Parallel(block_M, dim): acc_o[i, j] /= logsum[i] T.copy(acc_o, O_shared) - T.copy(O_shared, Output[bz, by, bx * block_M:(bx + 1) * block_M, :]) + T.copy(O_shared, Output[bz, by, bx * block_M : (bx + 1) * block_M, :]) return main def ref_program(Q, K, V, is_causal): dim = Q.size(-1) - scores = torch.einsum('bhqd,bhkd->bhqk', Q, K) + scores = torch.einsum("bhqd,bhkd->bhqk", Q, K) scores = scores / torch.sqrt(torch.tensor(dim, dtype=scores.dtype)) if is_causal: seq_q = Q.size(2) seq_kv = K.size(2) mask = torch.tril(torch.ones(seq_q, seq_kv, device=scores.device), seq_kv - seq_q) mask = mask.unsqueeze(0).unsqueeze(0) - scores = scores.masked_fill(mask == 0, float('-inf')) + scores = scores.masked_fill(mask == 0, 
float("-inf")) attention_weights = F.softmax(scores, dim=-1) - output = torch.einsum('bhqk,bhkd->bhqd', attention_weights, V) + output = torch.einsum("bhqk,bhkd->bhqd", attention_weights, V) return output @@ -206,18 +198,8 @@ def main( if is_causal: total_flops *= 0.5 - if (not tune): - kernel = flashattn( - batch, - heads, - seq_q, - seq_kv, - dim, - is_causal, - block_M=64, - block_N=64, - num_stages=0, - threads=128) + if not tune: + kernel = flashattn(batch, heads, seq_q, seq_kv, dim, is_causal, block_M=64, block_N=64, num_stages=0, threads=128) print(kernel.get_kernel_source()) ref_program_processed = partial(ref_program, is_causal=is_causal) diff --git a/maint/host_checks/01_num_args_mismatch.py b/maint/host_checks/01_num_args_mismatch.py new file mode 100644 index 0000000000000000000000000000000000000000..9528652eea985b6c1a57c80897329cd3c1eafef4 --- /dev/null +++ b/maint/host_checks/01_num_args_mismatch.py @@ -0,0 +1,22 @@ +"""Reproduce: Argument count mismatch. + +Note: The adapter-level wrapper expects only inputs (A, B) because C is marked as output. +Calling with the wrong number of inputs raises a ValueError before host entry. +""" + +import torch +from common import build_matmul_kernel + + +def main(): + M = N = K = 256 + fn = build_matmul_kernel(M, N, K, target="cuda") + + a = torch.empty((M, K), device="cuda", dtype=torch.float16) + # Missing b + # Expected: ValueError with message about expected vs. actual inputs + fn(a) + + +if __name__ == "__main__": + main() diff --git a/maint/host_checks/02_pointer_type_error.py b/maint/host_checks/02_pointer_type_error.py new file mode 100644 index 0000000000000000000000000000000000000000..188a4f8cc02254a47665f82b538a0210fa51d768 --- /dev/null +++ b/maint/host_checks/02_pointer_type_error.py @@ -0,0 +1,23 @@ +"""Reproduce: Pointer-type argument expected but scalar provided. + +We pass an integer for A; wrapper forwards it to the host where a pointer is expected. +Expected: error like "Expect buffer A_handle to be pointer or tensor" (exact name depends on kernel param). 
+""" + +import torch +from common import build_matmul_kernel + + +def main(): + M = N = K = 256 + fn = build_matmul_kernel(M, N, K, target="cuda") + + # Wrong type for A (int instead of tensor) + a = 1 + b = torch.empty((K, N), device="cuda", dtype=torch.float16) + + fn(a, b) + + +if __name__ == "__main__": + main() diff --git a/maint/host_checks/03_ndim_mismatch.py b/maint/host_checks/03_ndim_mismatch.py new file mode 100644 index 0000000000000000000000000000000000000000..76637e8deda8a52165209c026661e45a2cf6da75 --- /dev/null +++ b/maint/host_checks/03_ndim_mismatch.py @@ -0,0 +1,19 @@ +"""Reproduce: ndim (rank) mismatch for A.""" + +import torch +from common import build_matmul_kernel + + +def main(): + M = N = K = 128 + fn = build_matmul_kernel(M, N, K, target="cuda") + + # A has rank 3 instead of 2 + a = torch.empty((M, K, 1), device="cuda", dtype=torch.float16) + b = torch.empty((K, N), device="cuda", dtype=torch.float16) + + fn(a, b) + + +if __name__ == "__main__": + main() diff --git a/maint/host_checks/04_dtype_mismatch.py b/maint/host_checks/04_dtype_mismatch.py new file mode 100644 index 0000000000000000000000000000000000000000..f3554c1d6ace76971b1b787d8febfb7ce286dc09 --- /dev/null +++ b/maint/host_checks/04_dtype_mismatch.py @@ -0,0 +1,19 @@ +"""Reproduce: dtype mismatch for A (float32 vs expected float16).""" + +import torch +from common import build_matmul_kernel + + +def main(): + M = N = K = 128 + fn = build_matmul_kernel(M, N, K, target="cuda") + print(fn.get_host_source()) + + a = torch.empty((M, K), device="cuda", dtype=torch.float32) # should be float16 + b = torch.empty((K, N), device="cuda", dtype=torch.float16) + + fn(a, b) + + +if __name__ == "__main__": + main() diff --git a/maint/host_checks/05_shape_mismatch.py b/maint/host_checks/05_shape_mismatch.py new file mode 100644 index 0000000000000000000000000000000000000000..a48248176501d66c2a46207da7357d22b8519f7c --- /dev/null +++ b/maint/host_checks/05_shape_mismatch.py @@ -0,0 +1,19 @@ +"""Reproduce: shape constant/symbol mismatch on A.""" + +import torch +from common import build_matmul_kernel + + +def main(): + M = N = K = 128 + fn = build_matmul_kernel(M, N, K, target="cuda") + + # A's second dimension is wrong (K+1 instead of K) + a = torch.empty((M, K + 1), device="cuda", dtype=torch.float16) + b = torch.empty((K, N), device="cuda", dtype=torch.float16) + + fn(a, b) + + +if __name__ == "__main__": + main() diff --git a/maint/host_checks/06_strides_mismatch.py b/maint/host_checks/06_strides_mismatch.py new file mode 100644 index 0000000000000000000000000000000000000000..7e523cd64ee2f9e2c762f8813901db4b50938a74 --- /dev/null +++ b/maint/host_checks/06_strides_mismatch.py @@ -0,0 +1,19 @@ +"""Reproduce: strides check failure (non-contiguous A via transpose).""" + +import torch +from common import build_matmul_kernel + + +def main(): + M = N = K = 128 + fn = build_matmul_kernel(M, N, K, target="cuda") + + a = torch.empty((M, K), device="cuda", dtype=torch.float16) + a_nc = a.t() # non-contiguous after transpose + b = torch.empty((K, N), device="cuda", dtype=torch.float16) + + fn(a_nc, b) + + +if __name__ == "__main__": + main() diff --git a/maint/host_checks/07_device_type_mismatch.py b/maint/host_checks/07_device_type_mismatch.py new file mode 100644 index 0000000000000000000000000000000000000000..af8e5efd5dfcf9514de445aec8eb8bedd676f1f0 --- /dev/null +++ b/maint/host_checks/07_device_type_mismatch.py @@ -0,0 +1,18 @@ +"""Reproduce: device_type mismatch by passing CPU tensors to a CUDA kernel.""" + +import torch 
+from common import build_matmul_kernel + + +def main(): + M = N = K = 64 + fn = build_matmul_kernel(M, N, K, target="cuda") + + a = torch.empty((M, K), device="cpu", dtype=torch.float16) + b = torch.empty((K, N), device="cpu", dtype=torch.float16) + + fn(a, b) + + +if __name__ == "__main__": + main() diff --git a/maint/host_checks/08_device_id_mismatch.py b/maint/host_checks/08_device_id_mismatch.py new file mode 100644 index 0000000000000000000000000000000000000000..280aca1570d9200aaca9f21260c6270c760e8c5b --- /dev/null +++ b/maint/host_checks/08_device_id_mismatch.py @@ -0,0 +1,25 @@ +"""Reproduce: device_id mismatch (requires >=2 CUDA devices).""" + +import torch +from common import build_matmul_kernel + + +def main(): + if not torch.cuda.is_available(): + raise RuntimeError("CUDA is not available") + if torch.cuda.device_count() < 2: + print("[SKIP] Need at least 2 CUDA devices to reproduce device_id mismatch.") + return + + M = N = K = 64 + fn = build_matmul_kernel(M, N, K, target="cuda") + + a = torch.empty((M, K), device="cuda:0", dtype=torch.float16) + b = torch.empty((K, N), device="cuda:1", dtype=torch.float16) + # Output device is derived by the adapter; mismatch occurs in host checks + + fn(a, b) + + +if __name__ == "__main__": + main() diff --git a/maint/host_checks/09_null_data_pointer.py b/maint/host_checks/09_null_data_pointer.py new file mode 100644 index 0000000000000000000000000000000000000000..09f5de1aff04438b3f24113c06979c0bd1c61dd5 --- /dev/null +++ b/maint/host_checks/09_null_data_pointer.py @@ -0,0 +1,26 @@ +"""Reproduce: NULL data pointer (advanced). + +Passing None for a tensor argument will be forwarded through the adapter. Depending on +FFI handling, this commonly triggers a pointer-type assertion (e.g., "Expect buffer to be pointer or tensor") +or a host-side non-NULL pointer check. + +Note: Constructing a true DLTensor with NULL data in PyTorch is not typical; this script +demonstrates passing None, which still reproduces the intended class of failure. +""" + +import torch +from common import build_matmul_kernel + + +def main(): + M = N = K = 64 + fn = build_matmul_kernel(M, N, K, target="cuda") + + a = None # attempt to pass a null-like pointer + b = torch.empty((K, N), device="cuda", dtype=torch.float16) + + fn(a, b) + + +if __name__ == "__main__": + main() diff --git a/maint/host_checks/10_scalar_type_mismatch.py b/maint/host_checks/10_scalar_type_mismatch.py new file mode 100644 index 0000000000000000000000000000000000000000..4f2c90b8d1df8a51eb0bdffa94e2223ecbcf47fb --- /dev/null +++ b/maint/host_checks/10_scalar_type_mismatch.py @@ -0,0 +1,15 @@ +"""Reproduce: scalar parameter type mismatch (int/bool).""" + +from common import build_scalar_check_kernel + + +def main(): + fn = build_scalar_check_kernel(target="cuda") + + # Wrong types + fn(1.0, True) # x should be int -> Expect arg[0] to be int + fn(1, 2.5) # flag should be bool -> Expect arg[1] to be boolean + + +if __name__ == "__main__": + main() diff --git a/maint/host_checks/README.md b/maint/host_checks/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ac23d6fd2ad5a44a3d4fcb98486ebf500375ce39 --- /dev/null +++ b/maint/host_checks/README.md @@ -0,0 +1,21 @@ +# Host-Side Check Repro Scripts + +This folder contains standalone scripts that deliberately trigger host-side (and adapter-side) validation errors described in `docs/compiler_internals/tensor_checks.md`. Each script can be run directly and will reproduce the corresponding error with a minimal example. 
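+
+All of the repro scripts import `build_matmul_kernel` from a shared `common.py` helper that does not appear in this excerpt of the diff. A minimal, hypothetical sketch of such a helper (the function name and `target` argument are taken from the scripts above; the block sizes and kernel body are assumptions patterned on the GEMM examples elsewhere in this diff) might look like:
+
+```python
+import tilelang
+import tilelang.language as T
+
+
+def build_matmul_kernel(M, N, K, block_M=128, block_N=128, block_K=32, target="cuda"):
+    """Hypothetical sketch: compile a plain fp16 GEMM whose host-side checks the repro scripts exercise."""
+    dtype, accum_dtype = T.float16, T.float32
+
+    # C (index 2) is marked as output, so the returned callable only expects A and B.
+    @tilelang.jit(out_idx=[2], target=target)
+    def matmul(M, N, K, block_M, block_N, block_K):
+        @T.prim_func
+        def main(
+            A: T.Tensor((M, K), dtype),
+            B: T.Tensor((K, N), dtype),
+            C: T.Tensor((M, N), dtype),
+        ):
+            with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=128) as (bx, by):
+                A_shared = T.alloc_shared((block_M, block_K), dtype)
+                B_shared = T.alloc_shared((block_K, block_N), dtype)
+                C_local = T.alloc_fragment((block_M, block_N), accum_dtype)
+                T.clear(C_local)
+                for ko in T.Pipelined(T.ceildiv(K, block_K), num_stages=2):
+                    # Stage the next K-slice of A and B into shared memory, then accumulate.
+                    T.copy(A[by * block_M, ko * block_K], A_shared)
+                    T.copy(B[ko * block_K, bx * block_N], B_shared)
+                    T.gemm(A_shared, B_shared, C_local)
+                T.copy(C_local, C[by * block_M, bx * block_N])
+
+        return main
+
+    return matmul(M, N, K, block_M, block_N, block_K)
+```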
+ +## Prerequisites +- CUDA-capable environment (most scripts compile a CUDA-targeted kernel) +- Python packages: torch, tilelang + +## Usage +- Run any script, e.g.: + - `python 01_num_args_mismatch.py` + - `python 02_pointer_type_error.py` + - ... up to `10_scalar_type_mismatch.py` + +- Or run all at once with a summary: + - `python run_all.py` + - Logs per test are saved under `logs/` as `