import argparse
import logging
from tilelang import tvm as tvm
from tvm import DataType
import tilelang as tl
import tilelang.language as T
from tilelang.intrinsics import get_swizzle_layout
from tilelang.intrinsics.mma_macro_generator import (
TensorCoreIntrinEmitter,)
from tilelang.transform import simplify_prim_func
from tilelang.autotuner import autotune
import itertools
# Configure logger
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
def make_swizzle_layout(shared_buf):
dtype = shared_buf.dtype
shape = shared_buf.shape
can_swizzle = shape[-1] * DataType(dtype).bits == 512
if not can_swizzle:
return T.Layout(shape, lambda *args: args)
def transform_func(i, j):
new_warp_i, new_warp_j = get_swizzle_layout(i, j, shape[-1], dtype)
return [new_warp_i, new_warp_j]
return T.Layout(shape, transform_func)
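# Example (illustrative): a float16 buffer whose last dimension is 32 gives
# 32 * 16 = 512 bits per row, so the swizzled layout is applied; a last
# dimension of 16 (256 bits) falls back to the identity layout above.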
@simplify_prim_func
def tl_matmul(
M,
N,
K,
in_dtype,
out_dtype,
accum_dtype,
block_row_warps=1,
block_col_warps=1,
warp_row_tiles=16,
warp_col_tiles=16,
chunk=32,
stage=2,
enable_rasteration=False,
):
assert in_dtype in [
"float16",
"int8",
], "Currently only float16 and int8 are supported"
assert out_dtype in [
"float16",
"float32",
"int32",
], "Currently only float16, float32 and int32 are supported"
micro_size_x = micro_size_y = micro_size_k = 16
if out_dtype == "int32":
micro_size_k = 32
# This is a debug config
# chunk = 32 if in_dtype == "float16" else 64
shared_scope = "shared.dyn"
block_M = block_row_warps * warp_row_tiles
block_N = block_col_warps * warp_col_tiles
block_K = chunk
A_shape = (M, K)
B_shape = (N, K)
A_shared_shape = (block_M, block_K)
B_shared_shape = (block_N, block_K)
C_shared_shape = (
block_M,
block_N,
)
warp_size = 32
threads = warp_size * (block_row_warps * block_col_warps)
local_size_a = (micro_size_x * micro_size_k) // warp_size
local_size_b = (micro_size_y * micro_size_k) // warp_size
local_size_c = (micro_size_x * micro_size_y) // warp_size
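    # Worked example with the defaults above: the micro tiles are 16x16(x16)
    # and a warp has 32 threads, so each thread privately holds
    # local_size_a = 16 * 16 / 32 = 8 elements of A per micro tile
    # (and likewise 8 elements each of B and C).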
warp_rows = warp_row_tiles // micro_size_x
warp_cols = warp_col_tiles // micro_size_y
    # MMA emitter that auto-generates the ldmatrix/mma/stmatrix code
mma_emitter = TensorCoreIntrinEmitter(
a_dtype=in_dtype,
b_dtype=in_dtype,
accum_dtype=accum_dtype,
a_transposed=False,
b_transposed=True,
block_row_warps=block_row_warps,
block_col_warps=block_col_warps,
warp_row_tiles=warp_row_tiles,
warp_col_tiles=warp_col_tiles,
chunk=chunk,
)
@T.prim_func
def main(
A: T.Tensor(A_shape, in_dtype),
B: T.Tensor(B_shape, in_dtype),
C: T.Tensor((M, N), out_dtype),
):
with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by):
A_shared = T.alloc_shared(A_shared_shape, in_dtype, scope=shared_scope)
B_shared = T.alloc_shared(B_shared_shape, in_dtype, scope=shared_scope)
C_shared = T.alloc_shared(C_shared_shape, out_dtype, scope=shared_scope)
A_local = T.alloc_local((warp_rows * local_size_a), in_dtype)
B_local = T.alloc_local((warp_cols * local_size_b), in_dtype)
C_local = T.alloc_local((warp_rows * warp_cols * local_size_c), accum_dtype)
T.annotate_layout({
A_shared: make_swizzle_layout(A_shared),
B_shared: make_swizzle_layout(B_shared),
})
# Improve L2 Cache
T.use_swizzle(panel_size=10, enable=enable_rasteration)
T.clear(C_local)
for ko in T.Pipelined((K // block_K), num_stages=stage):
# Load A into shared memory
for i, k in T.Parallel(block_M, block_K):
A_shared[i, k] = A[by * block_M + i, ko * block_K + k]
# Load B into shared memory
for j, k in T.Parallel(block_N, block_K):
B_shared[j, k] = B[bx * block_N + j, ko * block_K + k]
for ki in T.serial(0, (block_K // micro_size_k)):
# Load A into fragment
mma_emitter.ldmatrix_a(A_local, A_shared, ki)
# Load B into fragment
mma_emitter.ldmatrix_b(B_local, B_shared, ki)
# Perform Matrix Multiplication
mma_emitter.mma(A_local, B_local, C_local)
# Perform STMatrix
mma_emitter.stmatrix(C_local, C_shared)
# Store shared into global
for i, j in T.Parallel(block_M, block_N):
C[by * block_M + i, bx * block_N + j] = C_shared[i, j]
return main
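# Minimal usage sketch (hypothetical sizes; assumes a CUDA device and that
# your tilelang version exposes a `tl.compile` entry point -- adapt as needed):
#   func = tl_matmul(1024, 1024, 1024, "float16", "float16", "float16")
#   kernel = tl.compile(func, out_idx=[2])
#   c = kernel(a, b)  # a: (M, K) fp16 CUDA tensor, b: (N, K) fp16 CUDA tensor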
def ref_program(A, B):
"""Reference matrix multiplication program."""
return A @ B.T
def get_configs(args, kwargs):
"""
Generate a list of configuration dictionaries that will be used for tuning.
    Parameters
    ----------
    args : tuple
        Positional arguments forwarded from `matmul`; `args[:3]` are (M, N, K)
        and `args[6]` is `with_roller`, which selects whether the roller
        deduces the search space
Returns
-------
list of dict
Each configuration dict includes various block sizes, pipeline stages,
thread numbers, and other parameters to explore during autotuning.
"""
M, N, K = args[:3]
with_roller = args[6]
if with_roller:
from tilelang.carver.template import MatmulTemplate
from tilelang.carver.arch import CUDA
from tilelang.carver.arch import CDNA
from tilelang.carver.roller.rasterization import NoRasterization
import torch
arch = CUDA("cuda") if torch.version.hip is None else CDNA("hip")
topk = 10
carve_template = MatmulTemplate(
M=M,
N=N,
K=K,
in_dtype="float16",
out_dtype="float16",
accum_dtype="float16",
).with_arch(arch)
func = carve_template.equivalent_function()
assert func is not None, "Function is None"
roller_hints = carve_template.recommend_hints(topk=topk)
if roller_hints is None:
raise ValueError("No Roller Hints Found for TensorCore Scheduling")
configs = []
for hint in roller_hints:
config = {}
block_m, block_n = hint.block
warp_m, warp_n = hint.warp
config["block_row_warps"] = block_m // warp_m
config["block_col_warps"] = block_n // warp_n
config["warp_row_tiles"] = warp_m
config["warp_col_tiles"] = warp_n
config["chunk"] = hint.rstep[0]
config["stage"] = hint.pipeline_stage
config["enable_rasteration"] = hint.rasterization_plan is not NoRasterization
configs.append(config)
for config in configs:
print(config)
else:
iter_params = dict(
block_row_warps=[1, 2, 4],
block_col_warps=[1, 2, 4],
warp_row_tiles=[16, 32, 64, 128],
warp_col_tiles=[16, 32, 64, 128],
chunk=[32, 64, 128, 256],
stage=[0, 2],
enable_rasteration=[True, False],
)
return [{
k: v for k, v in zip(iter_params, values)
} for values in itertools.product(*iter_params.values())]
return configs
@autotune(
configs=get_configs,
warmup=3,
rep=5,
ref_prog=ref_program,
skip_check=True,
)
@tl.jit(out_idx=[2],)
def matmul(
M,
N,
K,
in_dtype="float16",
out_dtype="float16",
accum_dtype="float16",
with_roller=False,
block_row_warps=None,
block_col_warps=None,
warp_row_tiles=None,
warp_col_tiles=None,
chunk=None,
stage=None,
enable_rasteration=None,
):
"""Create an autotuned tensor core matrix multiplication kernel."""
def kernel():
return tl_matmul(
M,
N,
K,
in_dtype=in_dtype,
out_dtype=out_dtype,
accum_dtype=accum_dtype,
block_row_warps=block_row_warps,
block_col_warps=block_col_warps,
warp_row_tiles=warp_row_tiles,
warp_col_tiles=warp_col_tiles,
chunk=chunk,
stage=stage,
enable_rasteration=enable_rasteration,
)
return kernel()
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Autotuned TensorCore MatMul Benchmark")
parser.add_argument("--m", type=int, default=16384, help="Matrix dimension M")
parser.add_argument("--n", type=int, default=16384, help="Matrix dimension N")
parser.add_argument("--k", type=int, default=16384, help="Matrix dimension K")
    parser.add_argument(
        "--with_roller",
        action="store_true",
        help="Whether to use the roller to deduce search spaces")
parser.add_argument(
"--dtype", type=str, default="float16", choices=["float16", "int8"], help="Input data type")
args = parser.parse_args()
M, N, K = args.m, args.n, args.k
in_dtype = args.dtype
out_dtype = "float32" if in_dtype == "int8" else "float16"
accum_dtype = "float32" if in_dtype == "int8" else "float16"
    with_roller = args.with_roller
# Compute total floating-point operations
total_flops = 2 * M * N * K
# Run autotuning
best_result = matmul(M, N, K, in_dtype, out_dtype, accum_dtype, with_roller)
best_latency = best_result.latency
best_config = best_result.config
ref_latency = best_result.ref_latency
# Print benchmark results
print(f"Best latency (s): {best_latency}")
print(f"Best TFlops: {total_flops / best_latency * 1e-9:.3f}")
print(f"Best config: {best_config}")
print(f"Reference TFlops: {total_flops / ref_latency * 1e-9:.3f}")
import argparse
import itertools
import logging
import torch
from triton.testing import do_bench
import tilelang
import tilelang.language as T
from tilelang.autotuner import autotune
from tilelang import jit
from tilelang.contrib import nvcc
from tilelang.layout import make_metadata_layout
# Configure logger
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
arch = nvcc.get_target_compute_version()
ARCH_INFO = {"8.0": (16, "int16"), "8.9": (16, "int16"), "9.0": (8, "uint8")}
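# Each entry maps a compute capability to (e_factor, e_dtype): the number of
# logical K elements covered by one element of the 2:4 sparsity metadata
# tensor E, and that element's storage type (values chosen to match the
# cutlass metadata layout used below).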
def ref_program(A, B):
"""
A reference matrix multiplication program, used to compare performance.
Parameters
----------
    A : torch.Tensor
        The matrix with shape (M, K).
    B : torch.Tensor
        The matrix with shape (N, K).

    Returns
    -------
    torch.Tensor
        The result of A @ B.T, shape (M, N).
"""
return A @ B.T
def get_configs(M, N, K):
"""
Generate a list of configuration dictionaries that will be used for tuning.
    Parameters
    ----------
    M, N, K : int
        The matrix multiplication problem size
Returns
-------
list of dict
Each configuration dict includes various block sizes, pipeline stages,
thread numbers, and other parameters to explore during autotuning.
"""
block_M = [64, 128, 256]
block_N = [64, 128, 256]
block_K = [64, 128]
num_stages = [0, 1, 2, 3]
thread_num = [128, 256]
enable_rasterization = [True, False]
policy = [T.GemmWarpPolicy.Square]
_configs = list(
itertools.product(
block_M,
block_N,
block_K,
num_stages,
thread_num,
policy,
enable_rasterization,
))
configs = [
{
"block_M": c[0],
"block_N": c[1],
"block_K": c[2],
"num_stages": c[3],
"thread_num": c[4],
"policy": c[5],
"enable_rasterization": c[6], # keep param name for backward-compat
} for c in _configs
]
return configs
def matmul_sp(M, N, K, accum_dtype):
"""
Create an autotuned matrix multiplication kernel for matrices of shape:
- A: (M, K)
- B: (K, N)
- C: (M, N)
Parameters
----------
M : int
The dimension M of the matrix multiplication.
N : int
The dimension N of the matrix multiplication.
K : int
The dimension K of the matrix multiplication.
    Returns
    -------
    result
        The autotuning result object; its `latency` field holds the best
        latency found and its `config` field holds the configuration that
        produced it.
"""
    # Decorate the kernel with autotune & jit, specifying:
    # - The tuning config list
    # - Warmup and repetition counts for more stable measurement
@autotune(
configs=get_configs(M, N, K),
warmup=3,
rep=20,
)
@jit(out_idx=[2],)
def kernel(
block_M=None,
block_N=None,
block_K=None,
num_stages=None,
thread_num=None,
policy=None,
enable_rasterization=None,
):
"""
The actual kernel to compute C = A @ B^T.
Parameters
----------
block_M : int
Block size in M dimension.
block_N : int
Block size in N dimension.
block_K : int
Block size in K dimension.
num_stages : int
Number of pipelined stages (for asynchronous load).
thread_num : int
Number of threads to use per block.
        policy : T.GemmWarpPolicy
            Warp partitioning policy for the GEMM.
        enable_rasterization : bool
            Whether to enable block rasterization for better L2 locality.
Returns
-------
Function
A TVM Tensor Language function (T.prim_func) that computes matmul.
"""
# Use half-precision for input data to reduce memory bandwidth,
# accumulate in float for better numerical accuracy
dtype = "float16"
e_factor, e_dtype = ARCH_INFO[arch]
@T.prim_func
def main(
A_sparse: T.Tensor((M, K // 2), dtype),
E: T.Tensor((M, K // e_factor), e_dtype),
B: T.Tensor((K, N), dtype),
C: T.Tensor((M, N), accum_dtype),
):
"""
The compiled TVM function for block-level matrix multiplication.
- We divide the entire (M, N) domain into blocks of shape
(block_M, block_N).
- Each block has its own allocated shared memory for sub-blocks
of A and B.
- The partial results go into C_local, and then we copy them back
to global memory C.
"""
# Bind x-dimension to block index in N,
# y-dimension to block index in M.
with T.Kernel(
T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=thread_num) as (bx, by):
# Allocate shared memory for A sub-block of shape (block_M, block_K)
A_shared = T.alloc_shared((block_M, block_K // 2), dtype)
# Allocate shared memory for B sub-block of shape (block_N, block_K)
B_shared = T.alloc_shared((block_K, block_N), dtype)
# Allocate shared memory for E sub-block of shape (block_M, block_K // E_factor)
E_shared = T.alloc_shared((block_M, block_K // e_factor), e_dtype)
# Allocate a local fragment for intermediate accumulation
C_local = T.alloc_fragment((block_M, block_N), accum_dtype)
# Allocate a shared memory for C sub-block of shape (block_M, block_N)
C_shared = T.alloc_shared((block_M, block_N), accum_dtype)
# Clear out the accumulation buffer
T.clear(C_local)
T.disable_warp_group_reg_alloc()
T.use_swizzle(panel_size=10, enable=enable_rasterization)
T.annotate_layout({
E:
make_metadata_layout(
E, mma_dtype="float16", backend="cutlass", block_k=block_K),
E_shared:
make_metadata_layout(
E_shared, mma_dtype="float16", backend="cutlass", block_k=block_K),
})
# Loop over sub-blocks in K dimension, pipelined by num_stages
for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages):
# Load a sub-block of A from global memory into A_shared
T.copy(A_sparse[by * block_M, k * block_K // 2], A_shared)
# Load a sub-block of E from global memory into E_shared
T.copy(E[by * block_M, k * block_K // e_factor], E_shared)
# Load a sub-block of B from global memory into B_shared
T.copy(B[k * block_K, bx * block_N], B_shared)
# Perform a partial matrix multiplication:
# C_local += A_shared @ B_shared
T.gemm_sp(
A_shared,
E_shared,
B_shared,
C_local,
transpose_B=False,
policy=policy,
)
# Write back the results from C_local to the global memory C
T.copy(C_local, C_shared)
T.copy(C_shared, C[by * block_M, bx * block_N])
return main
return kernel()
if __name__ == "__main__":
# Parse command-line arguments for matrix dimensions
parser = argparse.ArgumentParser(description="Autotuned MatMul Benchmark")
parser.add_argument("--m", type=int, default=16384, help="Matrix dimension M")
parser.add_argument("--n", type=int, default=16384, help="Matrix dimension N")
parser.add_argument("--k", type=int, default=16384, help="Matrix dimension K")
parser.add_argument("--disable_cache", action="store_true")
parser.add_argument(
"--accum_dtype",
type=str,
default="float",
choices=["float", "float16"],
help="Accumulation datatype")
    parser.add_argument(
        "--bench_torch_sparse",
        type=str,
        choices=["cutlass", "cusparselt"],
        default=None,
        help="Benchmark against the torch sparse implementation; note that currently only sm80 is supported",
    )
args = parser.parse_args()
if args.disable_cache:
tilelang.disable_cache()
M, N, K = args.m, args.n, args.k
# Compute total floating-point operations to measure throughput
total_flops = 2 * M * N * K
    # matmul_sp(...) returns an autotune result; read latency/config from it
best_result = matmul_sp(M, N, K, args.accum_dtype)
best_latency = best_result.latency
best_config = best_result.config
A = torch.randn(M, K, dtype=torch.float16, device="cuda")
B = torch.randn(K, N, dtype=torch.float16, device="cuda")
ref_latency = do_bench(lambda: A @ B)
if args.bench_torch_sparse is not None:
from torch.sparse import to_sparse_semi_structured, SparseSemiStructuredTensor
if args.bench_torch_sparse == 'cutlass':
SparseSemiStructuredTensor._FORCE_CUTLASS = True
A_sp = to_sparse_semi_structured(A, transposed=False)
torch_sparse_latency = do_bench(lambda: A_sp @ B)
# Print out the benchmark results
print(f"Best latency (s): {best_latency}")
print(f"Best TFlops: {total_flops / best_latency * 1e-9:.3f}")
print(f"Best config: {best_config}")
if args.bench_torch_sparse is not None:
print(
f"Torch sparse ({args.bench_torch_sparse}) TFlops: {total_flops / torch_sparse_latency * 1e-9:.3f}"
)
print(f"Reference Dense TFlops: {total_flops / ref_latency * 1e-9:.3f}")
# FP8 Matmul Benchmark (8192×8192)
This document records the throughput achieved by `benchmark_matmul.py` when multiplying FP8 matrices sized `M = N = 8192` across different `K` dimensions. Each measurement relies on the default autotuning search space bundled with the benchmark.
## Environment
- Repository commit: `6b1faf71faf18c564f5f77e0f5c1671cd91dfbc3`
- GPUs: `NVIDIA H800 SXM` on driver `560.35.05`
## How to Reproduce
```bash
cd benchmark/matmul_fp8
python - <<'PY'
from benchmark_matmul import matmul
M = 8192
N = 8192
for K in [256, 512, 1024, 2048, 4096, 8192, 16384]:
    res = matmul(M, N, K, False)
    tflops = 2 * M * N * K / res.latency * 1e-9  # res.latency is in ms
    print(f"K={K:5d} latency={res.latency:.6f} ms TFlops={tflops:.3f}")
PY
```
## Results
| K     | Latency (ms) | Throughput (TFLOPs) |
|-------|--------------|---------------------|
| 256 | 0.060352 | 569 |
| 512 | 0.080096 | 858 |
| 1024 | 0.121696 | 1129 |
| 2048 | 0.204672 | 1343 |
| 4096 | 0.374816 | 1467 |
| 8192 | 0.729664 | 1507 |
| 16384 | 1.427264 | 1541 |
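As a sanity check, each throughput figure follows directly from its latency (reported in milliseconds). For example, the `K = 8192` row:
```bash
python - <<'PY'
flops = 2 * 8192 * 8192 * 8192   # 2*M*N*K
latency_ms = 0.729664
print(f"{flops / latency_ms * 1e-9:.0f} TFLOPs")  # ~1507, matching the table
PY
```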
import argparse
import itertools
import logging
import tilelang
import tilelang.language as T
from tilelang.autotuner import autotune
from tilelang import jit
# Configure logger
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
def ref_program(A, B):
"""
A reference matrix multiplication program, used to compare performance.
Parameters
----------
    A : torch.Tensor
        The matrix with shape (M, K).
    B : torch.Tensor
        The matrix with shape (N, K).

    Returns
    -------
    torch.Tensor
        The result of A @ B.T computed in float32, shape (M, N).
"""
return A.float() @ B.T.float()
def get_configs(args, kwargs):
"""
Generate a list of configuration dictionaries that will be used for tuning.
    Parameters
    ----------
    args : tuple
        Positional arguments forwarded from `matmul`; `args[:4]` are
        (M, N, K, with_roller), where `with_roller` selects whether the
        roller deduces the search space
Returns
-------
list of dict
Each configuration dict includes various block sizes, pipeline stages,
thread numbers, and other parameters to explore during autotuning.
"""
M, N, K, with_roller = args[:4]
if with_roller:
from tilelang.carver.template import MatmulTemplate
from tilelang.carver.arch import CUDA
from tilelang.carver.arch import CDNA
from tilelang.carver.roller.rasterization import NoRasterization
import torch
arch = CDNA("hip") if torch.version.hip is not None else CUDA("cuda")
topk = 10
carve_template = MatmulTemplate(
M=M,
N=N,
K=K,
in_dtype="float16",
out_dtype="float16",
accum_dtype="float",
).with_arch(arch)
func = carve_template.equivalent_function()
assert func is not None, "Function is None"
roller_hints = carve_template.recommend_hints(topk=topk)
if roller_hints is None:
raise ValueError("No Roller Hints Found for TensorCore Scheduling")
configs = []
for hint in roller_hints:
config = {}
block_m, block_n = hint.block
warp_m, warp_n = hint.warp
# block_rows, block_cols represents warp partitioning
block_rows, block_cols = block_m // warp_m, block_n // warp_n
config["block_M"] = block_m
config["block_N"] = block_n
config["block_K"] = hint.rstep[0]
config["num_stages"] = hint.pipeline_stage
config["thread_num"] = block_rows * block_cols * 32
config["policy"] = T.GemmWarpPolicy.from_warp_partition(block_rows, block_cols)
config["enable_rasteration"] = hint.rasterization_plan is not NoRasterization
configs.append(config)
for config in configs:
print(config)
else:
iter_params = dict(
block_M=[64, 128, 256],
block_N=[64, 128, 256],
block_K=[64, 128],
num_stages=[0, 1, 2, 3],
thread_num=[128, 256],
policy=[T.GemmWarpPolicy.Square],
enable_rasteration=[True, False],
)
return [{
k: v for k, v in zip(iter_params, values)
} for values in itertools.product(*iter_params.values())]
return configs
@autotune(
configs=get_configs,
warmup=3,
rep=20,
)
@jit(out_idx=[2],)
def matmul(
M,
N,
K,
with_roller,
block_M=None,
block_N=None,
block_K=None,
num_stages=None,
thread_num=None,
policy=None,
enable_rasteration=None,
):
"""
Create an autotuned matrix multiplication kernel for matrices of shape:
- A: (M, K)
- B: (N, K)
- C: (M, N)
Parameters
----------
M : int
The dimension M of the matrix multiplication.
N : int
The dimension N of the matrix multiplication.
K : int
The dimension K of the matrix multiplication.
    Returns
    -------
    result
        The autotuning result object; its `latency` field holds the best
        latency found and its `config` field holds the configuration that
        produced it.
"""
    # Use FP8 (e4m3) inputs to reduce memory bandwidth,
    # accumulate in float32 for better numerical accuracy
dtype = "float8_e4m3"
accum_dtype = "float"
@T.prim_func
def main(
A: T.Tensor((M, K), dtype),
B: T.Tensor((N, K), dtype),
C: T.Tensor((M, N), dtype),
):
"""
The compiled TVM function for block-level matrix multiplication.
- We divide the entire (M, N) domain into blocks of shape
(block_M, block_N).
- Each block has its own allocated shared memory for sub-blocks
of A and B.
- The partial results go into C_local, and then we copy them back
to global memory C.
"""
# Bind x-dimension to block index in N,
# y-dimension to block index in M.
with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=thread_num) as (bx, by):
# Allocate shared memory for A sub-block of shape (block_M, block_K)
A_shared = T.alloc_shared((block_M, block_K), dtype)
# Allocate shared memory for B sub-block of shape (block_N, block_K)
B_shared = T.alloc_shared((block_N, block_K), dtype)
# Allocate a local fragment for intermediate accumulation
C_local = T.alloc_fragment((block_M, block_N), accum_dtype)
# Allocate a shared memory for C sub-block of shape (block_M, block_N)
C_shared = T.alloc_shared((block_M, block_N), dtype)
# Enable (or disable) swizzling optimization
T.use_swizzle(panel_size=10, enable=enable_rasteration)
            # Use a swizzled layout for C_shared so the TMA store is efficient
T.annotate_layout({C_shared: tilelang.layout.make_swizzled_layout(C_shared)})
# Clear out the accumulation buffer
T.clear(C_local)
# Loop over sub-blocks in K dimension, pipelined by num_stages
for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages):
# Load a sub-block of A from global memory into A_shared
T.copy(A[by * block_M, k * block_K], A_shared)
# Load a sub-block of B from global memory into B_shared
T.copy(B[bx * block_N, k * block_K], B_shared)
# Perform a partial matrix multiplication:
# C_local += A_shared @ B_shared^T
T.gemm(
A_shared,
B_shared,
C_local,
transpose_B=True,
policy=policy,
)
# Write back the results from C_local to the global memory C
T.copy(C_local, C_shared)
T.copy(C_shared, C[by * block_M, bx * block_N])
return main
if __name__ == "__main__":
# Parse command-line arguments for matrix dimensions
parser = argparse.ArgumentParser(description="Autotuned MatMul Benchmark")
parser.add_argument("--m", type=int, default=16384, help="Matrix dimension M")
parser.add_argument("--n", type=int, default=16384, help="Matrix dimension N")
parser.add_argument("--k", type=int, default=16384, help="Matrix dimension K")
parser.add_argument(
"--with_roller",
action="store_true",
help="Whether to enable BitBLAS roller for search space",
)
args = parser.parse_args()
M, N, K = args.m, args.n, args.k
with_roller = args.with_roller
# Compute total floating-point operations to measure throughput
total_flops = 2 * M * N * K
    # matmul(...) returns an autotune result; read latency/config from it
best_result = matmul(M, N, K, with_roller)
best_latency = best_result.latency
best_config = best_result.config
# Print out the benchmark results
print(f"Best latency (s): {best_latency}")
print(f"Best TFlops: {total_flops / best_latency * 1e-9:.3f}")
print(f"Best config: {best_config}")
# TODO: support prebuilt TVM
set(TVM_BUILD_FROM_SOURCE TRUE)
set(TVM_SOURCE ${CMAKE_SOURCE_DIR}/3rdparty/tvm)
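# To build against an external TVM checkout instead of the bundled submodule,
# point TVM_ROOT at a tree that contains cmake/config.cmake, e.g.
#   export TVM_ROOT=$HOME/tvm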
if(DEFINED ENV{TVM_ROOT})
if(EXISTS $ENV{TVM_ROOT}/cmake/config.cmake)
set(TVM_SOURCE $ENV{TVM_ROOT})
endif()
endif()
set(TVM_INCLUDES
${TVM_SOURCE}/include
${TVM_SOURCE}/ffi/include
${TVM_SOURCE}/src
${TVM_SOURCE}/3rdparty/dlpack/include
${TVM_SOURCE}/3rdparty/dmlc-core/include
)
FROM nvcr.io/nvidia/pytorch:22.12-py3
WORKDIR /root
RUN echo "LC_ALL=en_US.UTF-8" >> /etc/environment
ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential git wget \
libgtest-dev libprotobuf-dev protobuf-compiler libgflags-dev libsqlite3-dev llvm-dev \
&& apt-get clean autoclean && rm -rf /var/lib/apt/lists/{apt,dpkg,cache,log} /tmp/* /var/tmp/*
RUN wget https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-Linux-x86_64.sh -O install_miniconda.sh && \
bash install_miniconda.sh -b -p /opt/conda && rm install_miniconda.sh
ENV PATH="/opt/conda/bin:${PATH}"
ENV LIBGL_ALWAYS_INDIRECT=1
RUN conda install pip cmake && conda install -c conda-forge libstdcxx-ng=12 && conda clean --all
RUN apt-get update && apt-get install -y python3 python3-dev python3-setuptools gcc libtinfo-dev zlib1g-dev build-essential cmake libedit-dev libxml2-dev
RUN git clone https://github.com/tile-ai/tilelang.git --recursive -b main TileLang \
&& cd TileLang && ./install_cuda.sh
CMD bash
FROM nvcr.io/nvidia/pytorch:23.01-py3
WORKDIR /root
RUN echo "LC_ALL=en_US.UTF-8" >> /etc/environment
ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential git wget \
libgtest-dev libprotobuf-dev protobuf-compiler libgflags-dev libsqlite3-dev llvm-dev \
&& apt-get clean autoclean && rm -rf /var/lib/apt/lists/{apt,dpkg,cache,log} /tmp/* /var/tmp/*
RUN wget https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-Linux-x86_64.sh -O install_miniconda.sh && \
bash install_miniconda.sh -b -p /opt/conda && rm install_miniconda.sh
ENV PATH="/opt/conda/bin:${PATH}"
ENV LIBGL_ALWAYS_INDIRECT=1
RUN conda install pip cmake && conda install -c conda-forge libstdcxx-ng=12 && conda clean --all
RUN apt-get update && apt-get install -y python3 python3-dev python3-setuptools gcc libtinfo-dev zlib1g-dev build-essential cmake libedit-dev libxml2-dev
RUN git clone https://github.com/tile-ai/tilelang.git --recursive -b main TileLang \
&& cd TileLang && ./install_cuda.sh
CMD bash
FROM nvcr.io/nvidia/pytorch:23.07-py3
WORKDIR /root
RUN echo "LC_ALL=en_US.UTF-8" >> /etc/environment
ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential git wget \
libgtest-dev libprotobuf-dev protobuf-compiler libgflags-dev libsqlite3-dev llvm-dev \
&& apt-get clean autoclean && rm -rf /var/lib/apt/lists/{apt,dpkg,cache,log} /tmp/* /var/tmp/*
RUN wget https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-Linux-x86_64.sh -O install_miniconda.sh && \
bash install_miniconda.sh -b -p /opt/conda && rm install_miniconda.sh
ENV PATH="/opt/conda/bin:${PATH}"
ENV LIBGL_ALWAYS_INDIRECT=1
RUN conda install pip cmake && conda install -c conda-forge libstdcxx-ng=12 && conda clean --all
RUN apt-get update && apt-get install -y python3 python3-dev python3-setuptools gcc libtinfo-dev zlib1g-dev build-essential cmake libedit-dev libxml2-dev
RUN git clone https://github.com/tile-ai/tilelang.git --recursive -b main TileLang \
&& cd TileLang && ./install_cuda.sh
CMD bash
FROM nvcr.io/nvidia/pytorch:24.02-py3
WORKDIR /root
RUN echo "LC_ALL=en_US.UTF-8" >> /etc/environment
ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential git wget \
libgtest-dev libprotobuf-dev protobuf-compiler libgflags-dev libsqlite3-dev llvm-dev \
&& apt-get clean autoclean && rm -rf /var/lib/apt/lists/{apt,dpkg,cache,log} /tmp/* /var/tmp/*
RUN wget https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-Linux-x86_64.sh -O install_miniconda.sh && \
bash install_miniconda.sh -b -p /opt/conda && rm install_miniconda.sh
ENV PATH="/opt/conda/bin:${PATH}"
ENV LIBGL_ALWAYS_INDIRECT=1
RUN conda install pip cmake && conda install -c conda-forge libstdcxx-ng=12 && conda clean --all
RUN apt-get update && apt-get install -y python3 python3-dev python3-setuptools gcc libtinfo-dev zlib1g-dev build-essential cmake libedit-dev libxml2-dev
RUN git clone https://github.com/tile-ai/tilelang.git --recursive -b main TileLang \
&& cd TileLang && ./install_cuda.sh
CMD bash
FROM nvcr.io/nvidia/pytorch:24.05-py3
WORKDIR /root
RUN echo "LC_ALL=en_US.UTF-8" >> /etc/environment
ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential git wget \
libgtest-dev libprotobuf-dev protobuf-compiler libgflags-dev libsqlite3-dev llvm-dev \
&& apt-get clean autoclean && rm -rf /var/lib/apt/lists/{apt,dpkg,cache,log} /tmp/* /var/tmp/*
RUN wget https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-Linux-x86_64.sh -O install_miniconda.sh && \
bash install_miniconda.sh -b -p /opt/conda && rm install_miniconda.sh
ENV PATH="/opt/conda/bin:${PATH}"
ENV LIBGL_ALWAYS_INDIRECT=1
RUN conda install pip cmake && conda install -c conda-forge libstdcxx-ng=12 && conda clean --all
RUN apt-get update && apt-get install -y python3 python3-dev python3-setuptools gcc libtinfo-dev zlib1g-dev build-essential cmake libedit-dev libxml2-dev
RUN git clone https://github.com/tile-ai/tilelang.git --recursive -b main TileLang \
&& cd TileLang && ./install_cuda.sh
CMD bash
FROM nvcr.io/nvidia/pytorch:24.07-py3
WORKDIR /root
RUN echo "LC_ALL=en_US.UTF-8" >> /etc/environment
ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential git wget \
libgtest-dev libprotobuf-dev protobuf-compiler libgflags-dev libsqlite3-dev llvm-dev \
&& apt-get clean autoclean && rm -rf /var/lib/apt/lists/{apt,dpkg,cache,log} /tmp/* /var/tmp/*
RUN wget https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-Linux-x86_64.sh -O install_miniconda.sh && \
bash install_miniconda.sh -b -p /opt/conda && rm install_miniconda.sh
ENV PATH="/opt/conda/bin:${PATH}"
ENV LIBGL_ALWAYS_INDIRECT=1
RUN conda install pip cmake && conda install -c conda-forge libstdcxx-ng=12 && conda clean --all
RUN apt-get update && apt-get install -y python3 python3-dev python3-setuptools gcc libtinfo-dev zlib1g-dev build-essential cmake libedit-dev libxml2-dev
RUN git clone https://github.com/tile-ai/tilelang.git --recursive -b main TileLang \
&& cd TileLang && ./install_cuda.sh
CMD bash
FROM nvcr.io/nvidia/pytorch:24.12-py3
WORKDIR /root
RUN echo "LC_ALL=en_US.UTF-8" >> /etc/environment
ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential git wget \
libgtest-dev libprotobuf-dev protobuf-compiler libgflags-dev libsqlite3-dev llvm-dev \
&& apt-get clean autoclean && rm -rf /var/lib/apt/lists/{apt,dpkg,cache,log} /tmp/* /var/tmp/*
RUN wget https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-Linux-x86_64.sh -O install_miniconda.sh && \
bash install_miniconda.sh -b -p /opt/conda && rm install_miniconda.sh
ENV PATH="/opt/conda/bin:${PATH}"
ENV LIBGL_ALWAYS_INDIRECT=1
RUN conda install pip cmake && conda install -c conda-forge libstdcxx-ng=12 && conda clean --all
RUN apt-get update && apt-get install -y python3 python3-dev python3-setuptools gcc libtinfo-dev zlib1g-dev build-essential cmake libedit-dev libxml2-dev
RUN git clone https://github.com/tile-ai/tilelang.git --recursive -b main TileLang \
&& cd TileLang && ./install_cuda.sh
CMD bash
FROM nvcr.io/nvidia/pytorch:25.01-py3
WORKDIR /root
RUN echo "LC_ALL=en_US.UTF-8" >> /etc/environment
ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential git wget \
libgtest-dev libprotobuf-dev protobuf-compiler libgflags-dev libsqlite3-dev llvm-dev \
&& apt-get clean autoclean && rm -rf /var/lib/apt/lists/{apt,dpkg,cache,log} /tmp/* /var/tmp/*
RUN wget https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-Linux-x86_64.sh -O install_miniconda.sh && \
bash install_miniconda.sh -b -p /opt/conda && rm install_miniconda.sh
ENV PATH="/opt/conda/bin:${PATH}"
ENV LIBGL_ALWAYS_INDIRECT=1
RUN conda install pip cmake && conda install -c conda-forge libstdcxx-ng=12 && conda clean --all
RUN apt-get update && apt-get install -y python3 python3-dev python3-setuptools gcc libtinfo-dev zlib1g-dev build-essential cmake libedit-dev libxml2-dev
RUN git clone https://github.com/tile-ai/tilelang.git --recursive -b main TileLang \
&& cd TileLang && ./install_cuda.sh
CMD bash
FROM rocm/pytorch:rocm6.3.2_ubuntu22.04_py3.10_pytorch_release_2.4.0
WORKDIR /root
RUN echo "LC_ALL=en_US.UTF-8" >> /etc/environment
ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential git wget \
libgtest-dev libprotobuf-dev protobuf-compiler libgflags-dev libsqlite3-dev llvm-dev \
&& apt-get clean autoclean && rm -rf /var/lib/apt/lists/{cache,log} /tmp/* /var/tmp/*
ENV PATH="/opt/conda/bin:${PATH}"
ENV LIBGL_ALWAYS_INDIRECT=1
RUN conda run -n py_3.10 conda install pip cmake -y && \
conda run -n py_3.10 conda install -c conda-forge libstdcxx-ng=12 -y && \
conda clean --all
RUN apt-get update && apt-get install -y python3 python3-dev python3-setuptools gcc libtinfo-dev zlib1g-dev build-essential cmake libedit-dev libxml2-dev
RUN git clone https://github.com/tile-ai/tilelang.git --recursive -b main tilelang && \
conda run -n py_3.10 bash -c "cd tilelang && ./install_rocm.sh"
RUN conda init bash
SHELL ["/bin/bash", "-l", "-c"]
CMD ["bash", "-c", "source ~/.bashrc && conda activate py_3.10 && exec bash"]
To ease the process of installing all the dependencies, we provide Dockerfiles and a short guide for building Docker images with all of the above installed. The images are built on top of NVIDIA's NGC PyTorch containers (for CUDA) and the ROCm PyTorch container (for AMD GPUs), and they contain all the dependencies required to run the experiments.
```bash
git clone --recursive https://github.com/tile-ai/tilelang TileLang
cd TileLang/docker
# build the image, this may take a while (around 10+ minutes on our test machine)
# replace the version number cu124 with the one you want to use
# replace .cu** with .rocm for AMD GPU
docker build -t tilelang_workspace -f Dockerfile.cu124 .
# run the container
# if it's nvidia
docker run -it --cap-add=SYS_ADMIN --network=host --gpus all --cap-add=SYS_PTRACE --shm-size=4G --security-opt seccomp=unconfined --security-opt apparmor=unconfined --name tilelang_test tilelang_workspace bash
# if it's amd
docker run -it --cap-add=SYS_ADMIN --network=host --device=/dev/kfd --device=/dev/dri --cap-add=SYS_PTRACE --shm-size=4G --security-opt seccomp=unconfined --security-opt apparmor=unconfined --name tilelang_test tilelang_workspace bash
```
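Once inside the container, a quick smoke test (hypothetical session; pick the command matching your GPU) confirms the device is visible and tilelang imports:
```bash
nvidia-smi   # NVIDIA GPUs
rocm-smi     # AMD GPUs
python -c "import tilelang; print(tilelang.__file__)"
```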
_build/
autoapi/
tilelang.com
# Minimal makefile for Sphinx documentation
#
# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS ?=
SPHINXBUILD ?= python -m sphinx
SOURCEDIR = .
BUILDDIR = _build
# Put it first so that "make" without argument is like "make help".
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
.PHONY: help Makefile clean
# The "clean" target is updated to remove the autoapi generated files as well.
# Run "make clean" to ensure a completely fresh build.
clean:
rm -rf $(BUILDDIR) autoapi
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
# Tile Language Documentation
The documentation is built with [Sphinx](https://www.sphinx-doc.org/en/master/).
## Dependencies
Run the following command in this directory to install dependencies first:
```bash
pip3 install -r requirements.txt
```
## Build the Documentation
Then you can build the documentation by running:
```bash
make html
```
## View the Documentation
Run the following command to start a simple HTTP server:
```bash
cd _build/html
python3 -m http.server
```
Then you can view the documentation in your browser at `http://localhost:8000` (customize the port by appending `-p PORT_NUMBER` to the python command above).