Create a TVM GPU prim_func implementing a block-tiled matrix multiply that multiplies dense A by compressed/interleaved low-precision B (2-bit packed into int8 storage), decoding B to int8 on-chip and accumulating into C.
The returned prim_func expects:
- A: shape (M, K) with dtype `in_dtype` ("float16" or "int8").
- B: compressed storage with shape (N, K/4) and int8 storage layout (packing 4 2-bit elements per byte).
- C: output buffer shape (M, N) with dtype `out_dtype` ("float16", "float32", or "int32").
Details:
- Builds a tiled, pipelined kernel using shared memory and warp-level MMA intrinsics (INT4TensorCoreIntrinEmitter). B is loaded from compressed storage, decoded to int8 in threads (via decode_i2u_to_i8s / decode_i2s_to_i8s), and dequantized into a shared buffer used by the MMA emitter.
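For intuition about the packed layout, here is an assumed reference packer (illustrative only; the kernel's actual interleaving is defined by the decode_i2u_to_i8s / decode_i2s_to_i8s intrinsics):

import numpy as np

def pack_2bit_to_int8(B_elems: np.ndarray) -> np.ndarray:
    # Pack an (N, K) array of 2-bit values (0..3) into (N, K // 4) int8 storage,
    # four elements per byte along K.
    N, K = B_elems.shape
    assert K % 4 == 0
    packed = np.zeros((N, K // 4), dtype=np.uint8)
    for j in range(4):
        packed |= (B_elems[:, j::4].astype(np.uint8) & 0b11) << (2 * j)
    return packed.view(np.int8)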
GPU kernel entry that performs a blocked, pipelined matrix multiplication A @ B.T writing into C.
This kernel:
- Loads tiles of A and a compressed/interleaved representation of B from global memory into shared memory.
- Decodes B's packed low-precision format (storage_dtype, e.g., 2-bit packed) into element values of `in_dtype` in shared memory via an external decode routine.
- Uses warp/MMA tiled fragments and an INT4/INT2-capable MMA emitter to compute accumulation across K in a pipelined fashion with configurable stages.
- Writes accumulated tile results from shared memory back to global C with the expected block/micro-tile indexing.
Parameters:
A: Input matrix buffer of shape A_shape and element type `in_dtype`. Represents the MxK activations.
B: Compressed/interleaved weight buffer of shape B_shape and storage type `storage_dtype`. Must contain B in the packed low-precision layout expected by the decode routine used by this kernel.
C: Output buffer of shape (M, N) and type `out_dtype`; receives the resulting matrix (accumulated values are produced in `accum_dtype` and stored into C).
Side effects:
Writes results into C. Calls external device decode functions to expand B from its packed representation into shared memory before computation.
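A minimal sketch of the block/pipeline structure, assuming the TileLang T.* primitives used throughout these examples (the real kernel decodes B per-thread and drives INT4TensorCoreIntrinEmitter rather than the plain T.gemm shown here):

import tilelang.language as T

def build_kernel_sketch(M, N, K, block_M, block_N, block_K, threads, num_stages,
                        in_dtype="int8", storage_dtype="int8", accum_dtype="int32"):
    @T.prim_func
    def main(A: T.Tensor((M, K), in_dtype),
             B: T.Tensor((N, K // 4), storage_dtype),
             C: T.Tensor((M, N), accum_dtype)):
        with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by):
            A_shared = T.alloc_shared((block_M, block_K), in_dtype)
            B_shared = T.alloc_shared((block_N, block_K // 4), storage_dtype)
            B_dequantize_shared = T.alloc_shared((block_N, block_K), "int8")
            C_local = T.alloc_fragment((block_M, block_N), accum_dtype)
            T.clear(C_local)
            for ko in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages):
                T.copy(A[by * block_M, ko * block_K], A_shared)
                T.copy(B[bx * block_N, ko * (block_K // 4)], B_shared)
                # ... per-thread decode of B_shared into B_dequantize_shared goes here ...
                T.gemm(A_shared, B_dequantize_shared, C_local, transpose_B=True)
            T.copy(C_local, C[by * block_M, bx * block_N])
    return main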
...
@@ -10,15 +10,15 @@ from utils import torch_convert_bit_twiddling, torch_convert
def get_configs():
"""
"""
Return a list of tuning configuration dictionaries for the autotuned matmul kernel.
Return a list of tuning configuration dictionaries for the autotuned matmul kernel.
Each dictionary is a single combination (Cartesian product) of the following parameters:
Each dictionary is a single combination (Cartesian product) of the following parameters:
- block_M: tile size for M dimension (one of 64, 128, 256)
- block_M: tile size for M dimension (one of 64, 128, 256)
- block_N: tile size for N dimension (one of 64, 128, 256)
- block_N: tile size for N dimension (one of 64, 128, 256)
- block_K: tile size for K dimension
- block_K: tile size for K dimension
- num_stages: pipeline stages for K-loop (0 or 2)
- num_stages: pipeline stages for K-loop (0 or 2)
- threads: number of threads to launch (128, 256, or 512)
- threads: number of threads to launch (128, 256, or 512)
- split: K-splitting factor (1 or 2)
- split: K-splitting factor (1 or 2)
Returns:
Returns:
list[dict]: List of configuration dicts usable by the autotuner, where each dict maps
list[dict]: List of configuration dicts usable by the autotuner, where each dict maps
the parameter name to its chosen value.
the parameter name to its chosen value.
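A plausible implementation of this Cartesian product (the block_K candidates are not listed above, so the values below are placeholders):

import itertools

def get_configs():
    iter_params = dict(
        block_M=[64, 128, 256],
        block_N=[64, 128, 256],
        block_K=[64, 128],          # placeholder candidates
        num_stages=[0, 2],
        threads=[128, 256, 512],
        split=[1, 2],
    )
    return [dict(zip(iter_params.keys(), combo))
            for combo in itertools.product(*iter_params.values())]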
...
@@ -62,30 +62,30 @@ def matmul(M,
split=1):
"""
Builds a parameterized TileLang/TIR matrix-multiplication kernel that dequantizes 4-bit FP inputs to BF16 on-the-fly and computes C = A @ B^T.
This function returns a tiled, autotunable prim_func implementing a block-wise GEMM with shared-memory buffering and a pipelined K-loop. The kernel accepts:
- A: dense input of shape (M, K) with dtype `in_dtype`.
- B: packed quantized input of shape (N, QK) where QK = K / (8 / num_bits) stored as `uint8`.
- C: output of shape (M, N) with dtype `out_dtype`.
The generated kernel supports two dequantization paths:
- fast dequant (fast_dequant=True): calls an external mxfp dequantization intrinsic (twiddling-based) loaded from a C source returned by get_mxfp_intrin_group.
- simple dequant (fast_dequant=False): performs a pure-TIR FP4 -> BF16 conversion per element.
Important behavior and requirements:
- num_bits (default 4) is the bit-width of the quantized elements; storage_dtype is uint8 and num_elems_per_byte = 8 // num_bits.
- QK = K // num_elems_per_byte and Block_QK = block_K // num_elems_per_byte determine B and shared-buffer shapes.
- Asserts that K % (block_K * split) == 0; K must be divisible by block_K * split for the tiling to be valid.
- When fast_dequant is True, a valid mxfp intrinsic group (C source and function name) must be available via tilelang.quantize.get_mxfp_intrin_group.
- The kernel launches a 2D grid over ceildiv(N, block_N) and ceildiv(M, block_M) and uses `threads` threads per block with `num_stages` pipeline stages.
Parameters that alter kernel layout/behavior (brief):
- block_M, block_N, block_K: tile sizes for M, N, and K dimensions.
- num_stages: number of software pipeline stages for the K-loop.
- threads: number of threads used per kernel block.
- split: extra K-splitting factor; K must be divisible by block_K * split.
- source_format, num_bits: describe the quantized data layout passed to the mxfp intrinsics.
Returns:
A TileLang/TIR prim_func (the compiled `main`) implementing the described dequantize-then-GEMM kernel.
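The shape bookkeeping described above, as a short self-contained sketch (buffer-shape names other than A/B are assumptions consistent with the tiling):

def packed_shapes(M, N, K, block_N, block_K, split, num_bits=4):
    num_elems_per_byte = 8 // num_bits            # 2 FP4 values per uint8
    QK = K // num_elems_per_byte                  # packed K extent of B
    Block_QK = block_K // num_elems_per_byte      # packed K extent of one B tile
    assert K % (block_K * split) == 0, "K must be divisible by block_K * split"
    return dict(A_shape=(M, K), B_shape=(N, QK),
                B_shared_shape=(block_N, Block_QK),
                B_dequantize_shared_shape=(block_N, block_K))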
Create a TileLang macro that performs fast, twiddling-based dequantization from packed FP4 to BF16 using an external runtime plugin.
This function validates the requested input/output datatypes and returns a TileLang `@T.macro` named `fast_dequant_bf16_fp4_twiddling` which:
- Loads compressed FP4 bytes from a shared buffer into per-thread local registers (vectorized loads).
- Invokes an external dequantization routine (via `T.call_extern`) to expand the packed FP4 values into BF16 in registers.
- Writes the dequantized BF16 values back to a shared dequantized buffer for use by the kernel.
Notes and preconditions:
- Asserts that `in_dtype == "fp4"` and `out_dtype == "bfloat16"`.
- The generated macro depends on several surrounding-scope symbols (e.g., `import_source`, `func_name`, `block_K`, `Block_QK`, `threads`, `num_elems_per_byte`, `storage_dtype`, and `out_dtype`) and expects them to be defined consistently in the enclosing kernel.
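A rough sketch of such a factory, assuming the outer-scope names listed above and the TileLang T.* primitives (vector widths and the thread-to-element mapping are illustrative, not the exact implementation):

def get_fast_dequant_twiddling_func(in_dtype="fp4", out_dtype="bfloat16"):
    assert in_dtype == "fp4" and out_dtype == "bfloat16"
    local_compress_size = block_N * Block_QK // threads   # packed bytes per thread (assumed)
    local_size = local_compress_size * num_elems_per_byte

    @T.macro
    def fast_dequant_bf16_fp4_twiddling(B_shared, B_dequantize_shared):
        T.import_source(import_source)                    # make the C plugin available
        tx = T.get_thread_binding()
        B_local = T.alloc_local((local_compress_size,), storage_dtype)
        B_dequant_local = T.alloc_local((local_size,), out_dtype)
        for v in T.vectorized(local_compress_size):       # packed bytes -> registers
            idx = tx * local_compress_size + v
            B_local[v] = B_shared[idx // Block_QK, idx % Block_QK]
        T.call_extern("handle", func_name,
                      T.address_of(B_local[0]), T.address_of(B_dequant_local[0]))
        for v in T.vectorized(local_size):                # registers -> shared BF16
            idx = tx * local_size + v
            B_dequantize_shared[idx // block_K, idx % block_K] = B_dequant_local[v]

    return fast_dequant_bf16_fp4_twiddling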
...
@@ -149,17 +149,17 @@ def matmul(M,
# import fast_dequantize plugin
"""
"""
Fast dequantization kernel routine that converts packed FP4 values in shared memory to BF16 and writes the results back into a shared dequantized buffer.
Fast dequantization kernel routine that converts packed FP4 values in shared memory to BF16 and writes the results back into a shared dequantized buffer.
This function is intended to run inside a tiled GPU kernel: each thread loads a small packed segment from the quantized shared buffer `B_shared` into a per-thread local register buffer, calls an external dequantization routine (provided by the runtime plugin imported from `import_source` and identified by `func_name`) to expand the packed values to BF16 in a per-thread local output buffer, and stores the expanded values into `B_dequantize_shared`. It performs vectorized per-thread loads and stores and is sized according to the surrounding kernel's tiling and threading parameters.
This function is intended to run inside a tiled GPU kernel: each thread loads a small packed segment from the quantized shared buffer `B_shared` into a per-thread local register buffer, calls an external dequantization routine (provided by the runtime plugin imported from `import_source` and identified by `func_name`) to expand the packed values to BF16 in a per-thread local output buffer, and stores the expanded values into `B_dequantize_shared`. It performs vectorized per-thread loads and stores and is sized according to the surrounding kernel's tiling and threading parameters.
B_dequantize_shared: Shared-memory buffer to receive dequantized BF16 values (written in-place by this routine).
B_dequantize_shared: Shared-memory buffer to receive dequantized BF16 values (written in-place by this routine).
Side effects:
Side effects:
- Imports the external dequantization plugin via `import_source` and invokes `func_name`.
- Imports the external dequantization plugin via `import_source` and invokes `func_name`.
- Writes dequantized BF16 results into `B_dequantize_shared`.
- Writes dequantized BF16 results into `B_dequantize_shared`.
Notes:
Notes:
- This routine expects the surrounding kernel to define and provide the tiling/threading constants (e.g., thread count, local buffer sizes, block dimensions) and the runtime plugin identifiers (`import_source`, `func_name`).
- This routine expects the surrounding kernel to define and provide the tiling/threading constants (e.g., thread count, local buffer sizes, block dimensions) and the runtime plugin identifiers (`import_source`, `func_name`).
- No value is returned; results are produced by mutation of `B_dequantize_shared`.
- No value is returned; results are produced by mutation of `B_dequantize_shared`.
Create a simple TIR dequantization macro that converts packed 4-bit FP (FP4) stored in uint8 into bfloat16.
The returned macro (named `simple_dequant_bf16_fp4`) expects B_shared and B_dequantize_shared buffers (shapes and a few loop/constant names like
`B_shared_shape`, `B_dequantize_shared_shape`, `storage_dtype`, `out_dtype`, `num_bits`, `num_elems_per_byte`, `block_N`, and `block_K`) to be available in the surrounding TIR scope. It:
- Unpacks 4-bit FP values from the packed uint8 representation in B_shared.
- Converts each 4-bit value to a bfloat16 element using an internal helper `_tir_u8_to_f4_to_bf16`.
- Writes the dequantized bfloat16 block into B_dequantize_shared.
Constraints:
- Supports only in_dtype="fp4" and out_dtype="bfloat16".
- The helper assumes nbit == 4 and produces bfloat16 values.
- The macro uses a fixed test-scale of 0 (no per-element scaling) as written.
Returns:
A TIR macro function performing the described in-place block dequantization from packed uint8 FP4 to bfloat16.
"""
...
@@ -219,22 +219,22 @@ def matmul(M,
scale: tir.PrimExpr, dtype: str):
"""
"""
Convert a 4-bit FP4 value packed in a uint8 byte into a bfloat16 value.
Convert a 4-bit FP4 value packed in a uint8 byte into a bfloat16 value.
This helper extracts the 4-bit field located at the bit position `pos` within the
This helper extracts the 4-bit field located at the bit position `pos` within the
byte `val`, interprets it as an FP4 (sign, exponent, mantissa) value, applies an
byte `val`, interprets it as an FP4 (sign, exponent, mantissa) value, applies an
exponent `scale` offset to align it with bfloat16 exponent bias, clamps the
exponent `scale` offset to align it with bfloat16 exponent bias, clamps the
resulting exponent to 8 bits, and returns the assembled bfloat16 bit pattern.
resulting exponent to 8 bits, and returns the assembled bfloat16 bit pattern.
Parameters:
Parameters:
nbit (int): Number of bits in the packed element; must be 4.
nbit (int): Number of bits in the packed element; must be 4.
val (tir.PrimExpr): A uint8 value containing packed FP4 elements.
val (tir.PrimExpr): A uint8 value containing packed FP4 elements.
pos (tir.PrimExpr): Index (0-based) of which FP4 nibble inside `val` to extract.
pos (tir.PrimExpr): Index (0-based) of which FP4 nibble inside `val` to extract.
scale (tir.PrimExpr): Exponent offset applied when converting FP4 exponent to bfloat16.
scale (tir.PrimExpr): Exponent offset applied when converting FP4 exponent to bfloat16.
dtype (str): Target dtype string; must be "bfloat16".
dtype (str): Target dtype string; must be "bfloat16".
Returns:
Returns:
tir.PrimExpr: A bfloat16-typed PrimExpr containing the converted value.
tir.PrimExpr: A bfloat16-typed PrimExpr containing the converted value.
Notes:
Notes:
- The function asserts `nbit == 4`, `dtype == "bfloat16"`, and that `val.dtype` is "uint8".
- The function asserts `nbit == 4`, `dtype == "bfloat16"`, and that `val.dtype` is "uint8".
- The conversion uses a fixed mapping from FP4 exponent/mantissa layout into bfloat16
- The conversion uses a fixed mapping from FP4 exponent/mantissa layout into bfloat16
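For intuition, an equivalent decode in plain Python, assuming an e2m1 FP4 layout (sign, 2-bit exponent, 1-bit mantissa); the TIR helper assembles the bfloat16 bit pattern directly instead of going through Python floats:

def u8_to_fp4_to_float(val: int, pos: int, scale: int = 0) -> float:
    # Extract the nibble at position `pos` (0 or 1) from the byte `val`.
    f4 = (val >> (4 * pos)) & 0xF
    sign = -1.0 if (f4 >> 3) & 1 else 1.0
    exp = (f4 >> 1) & 0x3           # 2-bit exponent field
    man = f4 & 0x1                  # 1-bit mantissa field
    mag = man * 0.5 if exp == 0 else (1.0 + 0.5 * man) * 2.0 ** (exp - 1)
    return sign * mag * 2.0 ** scale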
Dequantize a packed FP4 uint8 shared buffer into BF16 and store the result into a shared dequantized buffer.
This helper:
- Loads B_shared into a local fragment, converts each packed FP4 element to BF16 using `_tir_u8_to_f4_to_bf16`, and writes the dequantized values into B_dequantize_shared.
- Iterates in parallel over the logical block columns (block_N) and block_K, unpacking elements from bytes using `num_elems_per_byte`.
- Uses a fixed scale of 0 in the conversion (placeholder for testing); `num_bits` and `num_elems_per_byte` are expected to be available from the enclosing scope.
Parameters:
B_shared: shared-memory buffer containing packed FP4 data (uint8-packed).
B_dequantize_shared: shared-memory buffer to receive BF16 dequantized values.
Side effects:
Writes dequantized BF16 values into B_dequantize_shared. No return value.
"""
...
@@ -298,7 +298,7 @@ def matmul(M,
):
"""
"""
Kernel entry for the tiled, pipelined matmul used by the generated prim_func.
Kernel entry for the tiled, pipelined matmul used by the generated prim_func.
This function implements a block-wise GEMM over a 2D grid (grid dims: ceildiv(N, block_N) x ceildiv(M, block_M)) with a thread block of `threads`. For each output block it:
This function implements a block-wise GEMM over a 2D grid (grid dims: ceildiv(N, block_N) x ceildiv(M, block_M)) with a thread block of `threads`. For each output block it:
- Allocates shared buffers for A, the packed/quantized B, and a dequantized B tile.
- Allocates shared buffers for A, the packed/quantized B, and a dequantized B tile.
- Allocates a fragment accumulator (C_local) and a shared output tile (C_shared) with a swizzled layout.
- Allocates a fragment accumulator (C_local) and a shared output tile (C_shared) with a swizzled layout.
...
@@ -307,16 +307,16 @@ def matmul(M,
- Dequantizes B into B_dequantize_shared using either the fast (twiddling/external) or the simple (pure-TIR) dequantization routine.
- Performs a GEMM accumulating into C_local with B transposed.
- Stores the accumulated block from C_local back to the global output C via C_shared.
Parameters:
- A: input tile of shape (M, K) with dtype `in_dtype`.
- B: packed/quantized input of shape (N, QK) with storage dtype `storage_dtype` (quantized FP4 packing).
- C: output tensor of shape (M, N) with dtype `out_dtype`.
Side effects:
- Writes the computed output block into the global tensor `C`.
- Uses and updates shared memory buffers and per-thread accumulators.
Compute a reference BF16 matrix multiply using a simple (non-twiddled) dequantization of qB.
Converts the quantized tensor `qB` to full-precision values via `torch_convert`, computes C = A @ B^T in float32, and casts the result to bfloat16 before returning.
Parameters:
A (torch.Tensor): Left input matrix with shape (M, K).
qB (torch.Tensor): Quantized representation of the right matrix; expected to be compatible with `torch_convert` and represent a matrix whose transpose will be multiplied by A.
Returns:
torch.Tensor: Resulting matrix C in bfloat16 with shape (M, N).
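A sketch of this reference, assuming `torch_convert` returns a dense float tensor of shape (N, K) (the function name below is illustrative):

import torch
from utils import torch_convert

def ref_program_simple(A: torch.Tensor, qB: torch.Tensor) -> torch.Tensor:
    B = torch_convert(qB)                                    # unpack FP4 -> float
    C = torch.matmul(A.to(torch.float32), B.to(torch.float32).T)
    return C.to(torch.bfloat16)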
Run and benchmark the tiled, optionally autotuned FP4->BF16 GEMM kernel and validate results against a PyTorch reference.
This function builds a matmul kernel (either with autotuning or fixed tiling), obtains a profiler, validates numerical correctness against the appropriate reference implementation (bit-twiddled fast dequantization or simple dequantization), and runs a benchmark that prints measured latency (ms) and effective TFLOPs.
Parameters:
m (int): Number of rows of A and output C (default 256).
n (int): Number of columns of B and output C (default 256).
k (int): Inner dimension (columns of A, rows of B) (default 256).
fast_dequant (bool): If True use the fast twiddling dequantization path and validate against the twiddling reference; otherwise use the simple dequant path (default True).
tune (bool): If True build the kernel with autotuning configurations; if False use a fixed tiling and threading configuration for reproducible benchmarking (default False).
Side effects:
- Prints latency and TFLOPs to stdout.
- Raises an assertion via the profiler if the kernel's outputs do not match the chosen reference within the tolerances (rtol=0.01, atol=0.01).
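A typical invocation, assuming the entry point is named `main` with the defaults described above:

if __name__ == "__main__":
    # Validate and benchmark the fast-dequant path with a fixed tiling.
    main(m=256, n=256, k=256, fast_dequant=True, tune=False)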
Generate a list of hyperparameter configuration dictionaries for tuning.
Each configuration is a dict with keys: 'block_M', 'block_N', 'block_K',
'num_stages', 'threads', and 'split'. The function returns the Cartesian
product of the parameter value lists:
...
@@ -60,7 +60,7 @@ def get_configs():
- num_stages: pipeline stages (0, 2)
- threads: thread counts (128, 256, 512)
- split: K-splitting factor (1, 2)
Returns:
List[dict]: A list of configuration dictionaries covering all combinations.
"""
...
@@ -99,7 +99,7 @@ def matmul(M,
split=1):
"""
"""
Construct and return a tiled matrix-multiply TIR kernel that multiplies A (shape MxK) by a quantized B (shape Nx(QK)) and writes an MxN output in out_dtype.
Construct and return a tiled matrix-multiply TIR kernel that multiplies A (shape MxK) by a quantized B (shape Nx(QK)) and writes an MxN output in out_dtype.
The generated kernel accepts:
The generated kernel accepts:
- A: dense matrix with element type `in_dtype`.
- A: dense matrix with element type `in_dtype`.
- B: packed quantized matrix stored as uint8 with `num_bits` bits per element (QK = K / (8/num_bits)).
- B: packed quantized matrix stored as uint8 with `num_bits` bits per element (QK = K / (8/num_bits)).
...
@@ -107,7 +107,7 @@ def matmul(M,
...
@@ -107,7 +107,7 @@ def matmul(M,
The kernel dequantizes B to a working floating format (out_dtype/accum_dtype) using one of two paths:
The kernel dequantizes B to a working floating format (out_dtype/accum_dtype) using one of two paths:
- fast_dequant (True): uses an external, hardware/implementation-specific intrinsic group (twiddling) for batch dequantization.
- fast_dequant (True): uses an external, hardware/implementation-specific intrinsic group (twiddling) for batch dequantization.
- fast_dequant (False): uses a simple elementwise dequantization helper.
- fast_dequant (False): uses a simple elementwise dequantization helper.
Parameters:
Parameters:
M, N, K (int): matrix dimensions (A is MxK, result is MxN). K must be divisible by (block_K * split).
M, N, K (int): matrix dimensions (A is MxK, result is MxN). K must be divisible by (block_K * split).
in_dtype (str): element type of A (e.g., "fp4" in this file).
in_dtype (str): element type of A (e.g., "fp4" in this file).
...
@@ -129,7 +129,7 @@ def matmul(M,
...
@@ -129,7 +129,7 @@ def matmul(M,
- dequantizes B via the chosen path into a shared dequantized tile,
- dequantizes B via the chosen path into a shared dequantized tile,
- performs a tiled GEMM accumulating into local fragments,
- performs a tiled GEMM accumulating into local fragments,
- writes the final MxN block to the global output tensor.
- writes the final MxN block to the global output tensor.
Notes:
Notes:
- The function queries an intrinsic group to obtain a fast dequantization implementation when fast_dequant is enabled; that intrinsic must supply a valid C source and function name.
- The function queries an intrinsic group to obtain a fast dequantization implementation when fast_dequant is enabled; that intrinsic must supply a valid C source and function name.
- The kernel layout uses swizzled shared-memory layouts for A, B, and the shared C tile.
- The kernel layout uses swizzled shared-memory layouts for A, B, and the shared C tile.
Return a TileLang macro that performs fast dequantization of twiddled FP4-packed data into BF16.
The returned macro has signature (B_shared, B_dequantize_shared, Scale, k) and:
- Loads packed FP4 elements from B_shared into per-thread local registers.
- Calls an external fast dequantization intrinsic (provided via `import_source` / `func_name` in the outer scope) to expand packed FP4 -> BF16 values.
- Applies a per-block scale factor derived from the Scale tensor (using exponentiation by powers of two).
- Writes the scaled BF16 results into B_dequantize_shared.
Notes:
- This factory only supports in_dtype="fp4" and out_dtype="bfloat16".
- The macro depends on several names from the enclosing scope (e.g., import_source, func_name, DataType, num_elems_per_byte, storage_dtype, block_N, block_K, threads, scale_size); those must be defined and consistent with the kernel that will use the macro.
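The per-block scale described above is a biased exponent: each Scale byte s contributes a multiplier of 2^(s - 127). A small NumPy illustration:

import numpy as np

def scale_factor(scale_byte: int) -> np.float32:
    # Bias of 127, so 127 -> 1.0, 130 -> 8.0, 124 -> 0.125.
    return np.float32(2.0) ** (int(scale_byte) - 127)

assert scale_factor(127) == 1.0
assert scale_factor(130) == 8.0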
...
@@ -194,21 +194,21 @@ def matmul(M,
Fast dequantization kernel: convert packed 4-bit quantized values in B_shared to bfloat16
in B_dequantize_shared using an external intrinsic optimized for twiddled (bit-packed) FP4,
applying per-block scale factors from Scale.
This routine is a tiled, thread-parallel helper that:
- Imports and calls an external dequantization function (via `import_source`/`func_name`)
to expand compressed uint8-packed FP4 values into BF16 fragments in-thread.
- Loads the corresponding per-block scale entry, interprets it as an exponent bias
(applies 2^(Scale - 127)), and multiplies the dequantized BF16 fragment by that factor.
- Writes the scaled BF16 results back into the shared B_dequantize_shared buffer in-place.
Create a simple (scalar) dequantization macro that converts 4-bit packed inputs to bfloat16.
Returns a T.macro that, given shared-storage buffers B_shared, B_dequantize_shared, a Scale tensor, and block index k, unpacks 4-bit values from B_shared, converts each nibble to a bfloat16 value using _tir_u8_to_f4_to_bf16, applies the per-element exponential Scale, and writes the dequantized BF16 block into B_dequantize_shared.
Notes:
- Only supports in_dtype="fp4" and out_dtype="bfloat16".
- The macro expects B_shared and B_dequantize_shared to have the shapes established in the enclosing scope (B_shared_shape, B_dequantize_shared_shape) and performs block-local copying into allocated fragments before elementwise conversion.
- B_dequantize_shared: shared-memory buffer to receive dequantized BF16 results.
- Scale: per-element exponent buffer; used to compute the scale factor for each dequantized element.
- k: current block index along the K dimension (used to select the appropriate slice of Scale).
Side effects:
- Mutates B_dequantize_shared by storing the dequantized BF16 fragment.
"""
...
@@ -320,9 +320,9 @@ def matmul(M,
):
"""
"""
Tiled, pipelined kernel entry that multiplies A with a quantized B (with per-block Scale) producing C.
Tiled, pipelined kernel entry that multiplies A with a quantized B (with per-block Scale) producing C.
This prim-level kernel implements a blocked, multi-threaded matmul: it loads tiles of A and the packed/quantized B into shared memory, dequantizes B (either via the fast intrinsic twiddling path or the simple per-element path), performs a block GEMM (with B transposed), and writes the accumulated block results into the output tensor C. The kernel allocates shared buffers for A, B, and the dequantized B, and a local fragment for accumulation; it runs over K in pipelined stages and expects the provided shapes and dtypes to match the tiling parameters used to build the function.
This prim-level kernel implements a blocked, multi-threaded matmul: it loads tiles of A and the packed/quantized B into shared memory, dequantizes B (either via the fast intrinsic twiddling path or the simple per-element path), performs a block GEMM (with B transposed), and writes the accumulated block results into the output tensor C. The kernel allocates shared buffers for A, B, and the dequantized B, and a local fragment for accumulation; it runs over K in pipelined stages and expects the provided shapes and dtypes to match the tiling parameters used to build the function.
Parameters are self-descriptive in the signature; notable behaviors:
Parameters are self-descriptive in the signature; notable behaviors:
- B is stored in a compact uint8-packed layout (num_bits per element) and is dequantized using Scale before GEMM.
- B is stored in a compact uint8-packed layout (num_bits per element) and is dequantized using Scale before GEMM.
- The selected dequantization path is controlled by the outer-scope flag `fast_dequant`.
- The selected dequantization path is controlled by the outer-scope flag `fast_dequant`.
...
@@ -376,14 +376,14 @@ def matmul(M,
def ref_program_twiddling(A, qB, Scale, Bias=None):
"""
"""
Compute A @ B^T where B is reconstructed from bit-twiddled 4-bit quantized data and per-block scales, returning bfloat16 results.
Compute A @ B^T where B is reconstructed from bit-twiddled 4-bit quantized data and per-block scales, returning bfloat16 results.
Converts the quantized matrix `qB` to floating-point via `torch_convert_bit_twiddling`, applies a per-element scale factor of 2^(Scale - 127) (where Scale indexes are grouped by 32 columns of B), computes the matrix product A · B^T in float, and casts the result to bfloat16.
Converts the quantized matrix `qB` to floating-point via `torch_convert_bit_twiddling`, applies a per-element scale factor of 2^(Scale - 127) (where Scale indexes are grouped by 32 columns of B), computes the matrix product A · B^T in float, and casts the result to bfloat16.
Parameters:
Parameters:
A (torch.Tensor): Left operand with shape (M, K), used in floating precision.
A (torch.Tensor): Left operand with shape (M, K), used in floating precision.
qB (torch.Tensor): Quantized representation of B (packed 4-bit values) compatible with torch_convert_bit_twiddling.
qB (torch.Tensor): Quantized representation of B (packed 4-bit values) compatible with torch_convert_bit_twiddling.
Scale (torch.Tensor): Per-column-group scale values; Scale indices correspond to groups of 32 columns in B.
Scale (torch.Tensor): Per-column-group scale values; Scale indices correspond to groups of 32 columns in B.
Returns:
Returns:
torch.Tensor: Resulting matrix C with shape (M, N) in bfloat16.
torch.Tensor: Resulting matrix C with shape (M, N) in bfloat16.
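A plausible PyTorch reference matching this description (the 32-column grouping and the 2^(Scale - 127) factor are taken from the docstring; the exact helper behavior is assumed):

import torch
from utils import torch_convert_bit_twiddling

def ref_program_twiddling(A, qB, Scale, Bias=None):
    B = torch_convert_bit_twiddling(qB)                      # (N, K) float
    factor = torch.pow(2.0, Scale.to(torch.float32) - 127)   # (N, K // 32)
    B = B.to(torch.float32) * factor.repeat_interleave(32, dim=1)
    C = torch.matmul(A.to(torch.float32), B.T)
    return C.to(torch.bfloat16)

The simple (non-twiddling) reference described below presumably differs only in calling `torch_convert` instead of `torch_convert_bit_twiddling`.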
Compute a BF16 matrix product A · B^T from a quantized B with simple (non-twiddling) dequantization.
Converts the quantized tensor `qB` to floating B via `torch_convert`, applies a per-element scale factor computed as 2^(Scale[i][j//32] - 127) (Scale supplies exponent offsets in 32-column groups), then computes C = A · B^T and returns the result converted to bfloat16.
Parameters:
- A: 2D tensor representing the left operand (will be cast to float32 for the matmul).
- qB: Quantized representation of B accepted by `torch_convert`.
- Scale: 2D tensor of exponent offsets; Scale[i][g] is applied to columns j where g == j // 32.
Returns:
- 2D bfloat16 tensor C containing the matrix product A · B^T.
No in-place modification is performed on inputs (a local floating copy of B is scaled).
Run and validate the tiled quantized matmul kernel, then benchmark its latency and report TFLOPS.
Builds a matmul kernel for the given matrix sizes and quantization scale size. If `tune` is True the kernel is obtained via the autotuning path; otherwise a fixed-parameter kernel is used. Validates numerical correctness against the appropriate reference implementation (bit-twiddling reference when `fast_dequant` is True, plain reference otherwise) with rtol/atol=0.01, prints a confirmation, then runs a benchmark (500 warmup iterations) and prints the measured latency (ms) and achieved TFLOPS.
Parameters:
m (int): Number of rows of A / output rows. Default 256.
n (int): Number of columns of B / output columns. Default 256.
scale_size (int): Size of the per-block scale vector used for dequantization. Default 32.
fast_dequant (bool): If True validate against the twiddling (fast dequant) reference and exercise the fast dequant path; otherwise use the simple dequant reference. Default True.
tune (bool): If True obtain a tuned/autotuned kernel; otherwise use a fixed-parameter kernel. Default False.
Generate a list of hyperparameter configuration dictionaries for tuning.
Each configuration is a dict with keys: 'block_M', 'block_N', 'block_K',
'num_stages', 'threads', and 'split'. The function returns the Cartesian
product of the parameter value lists:
...
@@ -60,7 +60,7 @@ def get_configs():
- num_stages: pipeline stages (0, 2)
- threads: thread counts (128, 256, 512)
- split: K-splitting factor (1, 2)
Returns:
List[dict]: A list of configuration dictionaries covering all combinations.
"""
...
@@ -99,7 +99,7 @@ def matmul(M,
split=1):
"""
"""
Construct and return a tiled matrix-multiply TIR kernel that multiplies A (shape MxK) by a quantized B (shape Nx(QK)) and writes an MxN output in out_dtype.
Construct and return a tiled matrix-multiply TIR kernel that multiplies A (shape MxK) by a quantized B (shape Nx(QK)) and writes an MxN output in out_dtype.
The generated kernel accepts:
The generated kernel accepts:
- A: dense matrix with element type `in_dtype`.
- A: dense matrix with element type `in_dtype`.
- B: packed quantized matrix stored as uint8 with `num_bits` bits per element (QK = K / (8/num_bits)).
- B: packed quantized matrix stored as uint8 with `num_bits` bits per element (QK = K / (8/num_bits)).
...
@@ -107,7 +107,7 @@ def matmul(M,
...
@@ -107,7 +107,7 @@ def matmul(M,
The kernel dequantizes B to a working floating format (out_dtype/accum_dtype) using one of two paths:
The kernel dequantizes B to a working floating format (out_dtype/accum_dtype) using one of two paths:
- fast_dequant (True): uses an external, hardware/implementation-specific intrinsic group (twiddling) for batch dequantization.
- fast_dequant (True): uses an external, hardware/implementation-specific intrinsic group (twiddling) for batch dequantization.
- fast_dequant (False): uses a simple elementwise dequantization helper.
- fast_dequant (False): uses a simple elementwise dequantization helper.
Parameters:
Parameters:
M, N, K (int): matrix dimensions (A is MxK, result is MxN). K must be divisible by (block_K * split).
M, N, K (int): matrix dimensions (A is MxK, result is MxN). K must be divisible by (block_K * split).
in_dtype (str): element type of A (e.g., "fp4" in this file).
in_dtype (str): element type of A (e.g., "fp4" in this file).
...
@@ -129,7 +129,7 @@ def matmul(M,
...
@@ -129,7 +129,7 @@ def matmul(M,
- dequantizes B via the chosen path into a shared dequantized tile,
- dequantizes B via the chosen path into a shared dequantized tile,
- performs a tiled GEMM accumulating into local fragments,
- performs a tiled GEMM accumulating into local fragments,
- writes the final MxN block to the global output tensor.
- writes the final MxN block to the global output tensor.
Notes:
Notes:
- The function queries an intrinsic group to obtain a fast dequantization implementation when fast_dequant is enabled; that intrinsic must supply a valid C source and function name.
- The function queries an intrinsic group to obtain a fast dequantization implementation when fast_dequant is enabled; that intrinsic must supply a valid C source and function name.
- The kernel layout uses swizzled shared-memory layouts for A, B, and the shared C tile.
- The kernel layout uses swizzled shared-memory layouts for A, B, and the shared C tile.
Return a TileLang macro that performs fast dequantization of twiddled FP4-packed data into BF16.
The returned macro has signature (B_shared, B_dequantize_shared, Scale, k) and:
- Loads packed FP4 elements from B_shared into per-thread local registers.
- Calls an external fast dequantization intrinsic (provided via `import_source` / `func_name` in the outer scope) to expand packed FP4 -> BF16 values.
- Applies a per-block scale factor derived from the Scale tensor (using exponentiation by powers of two).
- Writes the scaled BF16 results into B_dequantize_shared.
Notes:
- This factory only supports in_dtype="fp4" and out_dtype="bfloat16".
- The macro depends on several names from the enclosing scope (e.g., import_source, func_name, DataType, num_elems_per_byte, storage_dtype, block_N, block_K, threads, scale_size); those must be defined and consistent with the kernel that will use the macro.
...
@@ -194,21 +194,21 @@ def matmul(M,
Fast dequantization kernel: convert packed 4-bit quantized values in B_shared to bfloat16
in B_dequantize_shared using an external intrinsic optimized for twiddled (bit-packed) FP4,
applying per-block scale factors from Scale.
This routine is a tiled, thread-parallel helper that:
- Imports and calls an external dequantization function (via `import_source`/`func_name`)
to expand compressed uint8-packed FP4 values into BF16 fragments in-thread.
- Loads the corresponding per-block scale entry, interprets it as an exponent bias
(applies 2^(Scale - 127)), and multiplies the dequantized BF16 fragment by that factor.
- Writes the scaled BF16 results back into the shared B_dequantize_shared buffer in-place.
Create a simple (scalar) dequantization macro that converts 4-bit packed inputs to bfloat16.
Returns a T.macro that, given shared-storage buffers B_shared, B_dequantize_shared, a Scale tensor, and block index k, unpacks 4-bit values from B_shared, converts each nibble to a bfloat16 value using _tir_u8_to_f4_to_bf16, applies the per-element exponential Scale, and writes the dequantized BF16 block into B_dequantize_shared.
Notes:
- Only supports in_dtype="fp4" and out_dtype="bfloat16".
- The macro expects B_shared and B_dequantize_shared to have the shapes established in the enclosing scope (B_shared_shape, B_dequantize_shared_shape) and performs block-local copying into allocated fragments before elementwise conversion.
- B_dequantize_shared: shared-memory buffer to receive dequantized BF16 results.
- Scale: per-element exponent buffer; used to compute the scale factor for each dequantized element.
- k: current block index along the K dimension (used to select the appropriate slice of Scale).
Side effects:
- Mutates B_dequantize_shared by storing the dequantized BF16 fragment.
"""
...
@@ -319,9 +319,9 @@ def matmul(M,
):
"""
"""
Tiled, pipelined kernel entry that multiplies A with a quantized B (with per-block Scale) producing C.
Tiled, pipelined kernel entry that multiplies A with a quantized B (with per-block Scale) producing C.
This prim-level kernel implements a blocked, multi-threaded matmul: it loads tiles of A and the packed/quantized B into shared memory, dequantizes B (either via the fast intrinsic twiddling path or the simple per-element path), performs a block GEMM (with B transposed), and writes the accumulated block results into the output tensor C. The kernel allocates shared buffers for A, B, and the dequantized B, and a local fragment for accumulation; it runs over K in pipelined stages and expects the provided shapes and dtypes to match the tiling parameters used to build the function.
This prim-level kernel implements a blocked, multi-threaded matmul: it loads tiles of A and the packed/quantized B into shared memory, dequantizes B (either via the fast intrinsic twiddling path or the simple per-element path), performs a block GEMM (with B transposed), and writes the accumulated block results into the output tensor C. The kernel allocates shared buffers for A, B, and the dequantized B, and a local fragment for accumulation; it runs over K in pipelined stages and expects the provided shapes and dtypes to match the tiling parameters used to build the function.
Parameters are self-descriptive in the signature; notable behaviors:
Parameters are self-descriptive in the signature; notable behaviors:
- B is stored in a compact uint8-packed layout (num_bits per element) and is dequantized using Scale before GEMM.
- B is stored in a compact uint8-packed layout (num_bits per element) and is dequantized using Scale before GEMM.
- The selected dequantization path is controlled by the outer-scope flag `fast_dequant`.
- The selected dequantization path is controlled by the outer-scope flag `fast_dequant`.
...
@@ -384,14 +384,14 @@ def matmul(M,
def ref_program_twiddling(A, qB, Scale, Bias=None):
"""
"""
Compute A @ B^T where B is reconstructed from bit-twiddled 4-bit quantized data and per-block scales, returning bfloat16 results.
Converts the quantized matrix `qB` to floating-point via `torch_convert_bit_twiddling`, applies a per-element scale factor of 2^(Scale - 127) (where Scale indexes are grouped by 32 columns of B), computes the matrix product A · B^T in float, and casts the result to bfloat16.
Parameters:
A (torch.Tensor): Left operand with shape (M, K), used in floating precision.
qB (torch.Tensor): Quantized representation of B (packed 4-bit values) compatible with torch_convert_bit_twiddling.
Scale (torch.Tensor): Per-column-group scale values; Scale indices correspond to groups of 32 columns in B.
Returns:
torch.Tensor: Resulting matrix C with shape (M, N) in bfloat16.
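A minimal PyTorch sketch of this reference, assuming `torch_convert_bit_twiddling` (defined elsewhere in this file) is in scope and that Scale stores biased exponents applied as 2^(Scale - 127):
import torch

def ref_program_twiddling_sketch(A, qB, Scale, Bias=None):
    # Decode the packed 4-bit weights to floating point.
    B = torch_convert_bit_twiddling(qB).float()
    # Per-element scale 2^(Scale[i][j // 32] - 127), with 32-column groups.
    cols = torch.arange(B.shape[1], device=B.device)
    B = B * torch.pow(2.0, Scale[:, cols // 32].float() - 127)
    # A @ B^T in float32, cast to bfloat16 at the end.
    return (A.float() @ B.T).to(torch.bfloat16)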
Compute a BF16 matrix product A · B^T from a quantized B with simple (non-twiddling) dequantization.
Converts the quantized tensor `qB` to floating B via `torch_convert`, applies a per-element scale factor computed as 2^(Scale[i][j//32] - 127) (Scale supplies exponent offsets in 32-column groups), then computes C = A · B^T and returns the result converted to bfloat16.
Parameters:
- A: 2D tensor representing the left operand (will be cast to float32 for the matmul).
- qB: Quantized representation of B accepted by `torch_convert`.
- Scale: 2D tensor of exponent offsets; Scale[i][g] is applied to columns j where g == j // 32.
Returns:
- 2D bfloat16 tensor C containing the matrix product A · B^T.
No in-place modification is performed on inputs (a local floating copy of B is scaled).
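A corresponding sketch of the simple path, assuming `torch_convert` is in scope and using the same 2^(Scale - 127) exponent convention; the group loop mirrors the Scale[i][j // 32] indexing described above:
import torch

def ref_program_simple_sketch(A, qB, Scale, Bias=None):
    B = torch_convert(qB).float()  # simple (non-twiddling) decode; .float() makes a local copy
    # Scale[i][g] supplies an exponent offset for columns j with g == j // 32.
    for g in range(Scale.shape[1]):
        cols = slice(g * 32, (g + 1) * 32)
        B[:, cols] *= torch.pow(2.0, Scale[:, g:g + 1].float() - 127)
    return (A.float() @ B.T).to(torch.bfloat16)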
Run and validate the tiled quantized matmul kernel, then benchmark its latency and report TFLOPS.
Builds a matmul kernel for the given matrix sizes and quantization scale size. If `tune` is True the kernel is obtained via the autotuning path; otherwise a fixed-parameter kernel is used. Validates numerical correctness against the appropriate reference implementation (bit-twiddling reference when `fast_dequant` is True, plain reference otherwise) with rtol/atol=0.01, prints a confirmation, then runs a benchmark (500 warmup iterations) and prints the measured latency (ms) and achieved TFLOPS (see the formula sketch after this docstring).
Parameters:
m (int): Number of rows of A / output rows. Default 256.
n (int): Number of columns of B / output columns. Default 256.
scale_size (int): Size of the per-block scale vector used for dequantization. Default 32.
fast_dequant (bool): If True, validate against the twiddling (fast dequant) reference and exercise the fast dequant path; otherwise use the simple dequant reference. Default True.
tune (bool): If True, obtain a tuned/autotuned kernel; otherwise use a fixed-parameter kernel. Default False.
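For reference, a small sketch of the TFLOPS computation, assuming the usual GEMM operation count of 2*M*N*K and a latency measured in milliseconds:
def tflops_from_latency_ms(m: int, n: int, k: int, latency_ms: float) -> float:
    # 2*M*N*K multiply-accumulate operations; latency converted from ms to seconds.
    return (2.0 * m * n * k) / (latency_ms * 1e-3) / 1e12

# Example: a 256 x 256 x 1024 problem finishing in 0.05 ms
# tflops_from_latency_ms(256, 256, 1024, 0.05)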
Convert a 2-D uint8 tensor into a bfloat16 tensor by decoding pairs of input bytes with a bit-twiddling scheme.
This function expects `tensor` to be a 2-D torch.Tensor of dtype `torch.uint8`. Each output element is produced by combining two input bytes and extracting a bf16-like 16-bit pattern according to one of four positional bit layouts (pos 0..3). The result is scaled by 2**126 to adjust the exponent bias and returned as dtype `torch.bfloat16`.
Decode a 2D uint8 tensor into a 2D bfloat16 tensor by expanding each byte into two bf16 values using a 4-bit (nibble) encoding.
Each input byte holds two 4-bit encoded values (low and high nibble). For each nibble this function derives sign/scale bits, a 3-bit exponent fragment and a 1-bit mantissa fragment, assembles a 16-bit bf16 pattern, and returns the resulting tensor with shape (N, K*2) and dtype torch.bfloat16 on the same device as the input.
Parameters:
tensor (torch.Tensor): 2D tensor of dtype torch.uint8 and shape (N, K). Each byte contains two encoded 4-bit entries that become two bf16 values.
scale_size (int, optional): If provided, controls how elements of the optional Scale tensor are indexed. When supplied, per-output-element scaling is applied to the exponent using Scale.
Scale (torch.Tensor, optional): A 2D tensor used to supply per-element integer scale adjustments to the exponent. If scale_size is provided, the scale used for output element (i, j) is Scale[i][j // scale_size].
Returns:
torch.Tensor: A new tensor of shape (N, K*2) and dtype torch.bfloat16 containing the decoded bf16 values.
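A structural sketch of this decode, illustrative only: the nibble-to-value table below is a hypothetical stand-in (the real function assembles bf16 bit patterns from sign/exponent/mantissa fragments), the low/high-nibble interleaving is an assumption, and the optional Scale is applied as 2^(Scale - 127) to match the reference programs above.
import torch

# Hypothetical placeholder table; not the actual 4-bit encoding used by torch_convert.
NIBBLE_TO_FLOAT = torch.arange(16, dtype=torch.float32)

def torch_convert_sketch(tensor, scale_size=None, Scale=None):
    N, K = tensor.shape
    table = NIBBLE_TO_FLOAT.to(tensor.device)
    low = (tensor & 0x0F).long()           # first encoded 4-bit value in each byte
    high = ((tensor >> 4) & 0x0F).long()   # second encoded 4-bit value in each byte
    out = torch.empty(N, K * 2, dtype=torch.float32, device=tensor.device)
    out[:, 0::2] = table[low]              # element interleaving here is an assumption
    out[:, 1::2] = table[high]
    if scale_size is not None and Scale is not None:
        cols = torch.arange(K * 2, device=tensor.device)
        out = out * torch.pow(2.0, Scale[:, cols // scale_size].float() - 127)
    return out.to(torch.bfloat16)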
Compute the matrix product of A and the transpose of B.
A and B are expected to be 2-D tensors where A has shape (M, K) and B has shape (N, K). The result is a tensor with shape (M, N) equal to A @ B.T, using the inputs' dtypes.
"""
return A @ B.T
...
@@ -22,26 +22,26 @@ def ref_program(A, B):
def get_configs(M, N, K, with_roller=False, topk=20):
"""
Generate a list of kernel tuning configuration dictionaries for a tiled matrix-multiply.
When with_roller is True this queries the MatmulTemplate roller to produce up to `topk` recommended
configurations (device-specific TensorCore-friendly tilings). Each returned dict contains:
- block_M, block_N, block_K: tile sizes
- num_stages: pipeline staging (0 means no explicit staging)
- thread_num: total threads used for the block
- enable_rasteration: whether a rasterization/swizzle layout was recommended (note spelling)
When with_roller is False this returns the Cartesian product of a fixed set of candidate
parameters (see the sketch after this docstring); the returned dicts use the backward-compatible key name "enable_rasteration" for that flag.
Parameters:
M, N, K (int): GEMM dimensions used to generate valid tile sizes.
with_roller (bool): If True, use MatmulTemplate's roller to generate device-aware hints;
otherwise use a predefined candidate grid.
topk (int): Maximum number of roller hints to request when with_roller is True.
Returns:
List[dict]: A list of configuration dictionaries as described above.
Raises:
ValueError: if with_roller is True but the roller returns no hints.
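A minimal sketch of the non-roller path; the candidate values below are placeholders rather than the exact grid used by the example, but the dictionary keys match the ones listed above:
import itertools

def get_configs_sketch(M, N, K, with_roller=False, topk=20):
    if with_roller:
        raise NotImplementedError("the roller path queries MatmulTemplate hints")
    # Hypothetical candidate grid; the real example defines its own values.
    keys = ["block_M", "block_N", "block_K", "num_stages", "thread_num", "enable_rasteration"]
    candidates = [
        [64, 128],      # block_M
        [64, 128],      # block_N
        [32, 64],       # block_K
        [0, 2],         # num_stages (0 means no explicit staging)
        [128, 256],     # thread_num
        [True, False],  # enable_rasteration (backward-compatible key spelling)
    ]
    return [dict(zip(keys, combo)) for combo in itertools.product(*candidates)]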