[Example] Introduce autotuning example for GEMM with enhanced configuration options (#360)

* Added a new example script `example_gemm_autotune.py` to demonstrate autotuning for matrix multiplication (GEMM) using TileLang.
* Implemented functions for generating configurations, selecting the best configuration, and benchmarking performance.
* Refactored the existing `matmul` function to support dynamic configuration parameters and improved kernel compilation.
* Updated the main execution block to include command-line argument parsing for matrix dimensions and autotuning options.
* Enhanced the example to validate results against a reference implementation, ensuring correctness in matrix multiplication operations.

[Example] Introduce autotuning example for GEMM with enhanced configuration options (#360)
* Added a new example script `example_gemm_autotune.py` to demonstrate autotuning for matrix multiplication (GEMM) using TileLang.
* Implemented functions for generating configurations, selecting the best configuration, and benchmarking performance.
* Refactored the existing `matmul` function to support dynamic configuration parameters and improved kernel compilation.
* Updated the main execution block to include command-line argument parsing for matrix dimensions and autotuning options.
* Enhanced the example to validate results against a reference implementation, ensuring correctness in matrix multiplication operations.
d4194222 · Yu Cheng · LeiWang1999 · 7fdcedd0 · d4194222 · d4194222
Commit d4194222 authored Apr 09, 2025 by Yu Cheng Committed by LeiWang1999 Apr 09, 2025
Hide whitespace changes
Inline Side-by-side

Showing with 285 additions and 231 deletions

examples/gemm/example_gemm.py examples/gemm/example_gemm.py +39 -231

examples/gemm/example_gemm_autotune.py examples/gemm/example_gemm_autotune.py +246 -0

No files found.
--- a/examples/gemm/example_gemm.py
+++ b/examples/gemm/example_gemm.py
-import argparse
-import torch
-import itertools
-import tilelang as tl
+import tilelang
 import tilelang.language as T
-from tilelang.autotuner import AutoTuner
-from tilelang.carver.template import MatmulTemplate
-from tilelang.carver.arch import CUDA
-from tilelang.carver.roller.rasterization import NoRasterization
-
-
-def ref_program(A, B):
-    return A @ B.T
-
-
-def get_configs(M, N, K, with_roller=False, topk=20):
-    if with_roller:
-        arch = CUDA("cuda")
-        carve_template = MatmulTemplate(
-            M=M,
-            N=N,
-            K=K,
-            in_dtype="float16",
-            out_dtype="float16",
-            accum_dtype="float",
-        ).with_arch(arch)
-
-        func = carve_template.equivalent_function()
-        assert func is not None, "Function is None"
-        roller_hints = carve_template.recommend_hints(topk=topk)
-        if roller_hints is None:
-            raise ValueError("No Roller Hints Found for TensorCore Scheduling")
-        configs = []
-        for hint in roller_hints:
-            config = {}
-            block_m, block_n = hint.block
-            warp_m, warp_n = hint.warp
-            # block_rows, block_cols represents warp partitioning
-            block_rows, block_cols = block_m // warp_m, block_n // warp_n
-            config["block_M"] = block_m
-            config["block_N"] = block_n
-            config["block_K"] = hint.rstep[0]
-            config["num_stages"] = hint.pipeline_stage if hint.pipeline_stage > 1 else 0
-            config["thread_num"] = block_rows * block_cols * 32
-            config["enable_rasteration"] = hint.rasterization_plan is not NoRasterization
-            configs.append(config)
-        for config in configs:
-            print(config)
-    else:
-        block_M = [64, 128, 256]
-        block_N = [64, 128, 256]
-        block_K = [32, 64]
-        num_stages = [0, 1, 2, 3]
-        thread_num = [128, 256]
-        enable_rasterization = [True, False]
-        _configs = list(
-            itertools.product(
-                block_M,
-                block_N,
-                block_K,
-                num_stages,
-                thread_num,
-                enable_rasterization,
-            ))
-
-        configs = [
-            {
-                "block_M": c[0],
-                "block_N": c[1],
-                "block_K": c[2],
-                "num_stages": c[3],
-                "thread_num": c[4],
-                "enable_rasteration": c[5],  # keep param name for backward-compat
-            } for c in _configs
-        ]
-    return configs
-
-
-def get_best_config(M, N, K, with_roller=False):
-
-    def kernel(
-        block_M=None,
-        block_N=None,
-        block_K=None,
-        num_stages=None,
-        thread_num=None,
-        enable_rasteration=None,
-    ):
-        dtype = "float16"
-        accum_dtype = "float"
-
-        @T.prim_func
-        def main(
-                A: T.Tensor((M, K), dtype),
-                B: T.Tensor((N, K), dtype),
-                C: T.Tensor((M, N), dtype),
-        ):
-            with T.Kernel(
-                    T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=thread_num) as (bx, by):
-                A_shared = T.alloc_shared((block_M, block_K), dtype)
-                B_shared = T.alloc_shared((block_N, block_K), dtype)
-                C_local = T.alloc_fragment((block_M, block_N), accum_dtype)
-                C_shared = T.alloc_shared((block_M, block_N), dtype)
-                T.use_swizzle(panel_size=10, enable=enable_rasteration)
-                T.clear(C_local)
-                for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages):
-                    T.copy(A[by * block_M, k * block_K], A_shared)
-                    T.copy(B[bx * block_N, k * block_K], B_shared)
-                    T.gemm(
-                        A_shared,
-                        B_shared,
-                        C_local,
-                        transpose_B=True,
-                    )
-                T.copy(C_local, C_shared)
-                T.copy(C_shared, C[by * block_M, bx * block_N])
-
-        return main
-
-    autotuner = AutoTuner.from_kernel(
-        kernel=kernel, configs=get_configs(M, N, K, with_roller)).set_compile_args(
-            out_idx=[-1],
-            supply_type=tl.TensorSupplyType.Integer,
-            ref_prog=ref_program,
-            skip_check=False,
-            target="auto",
-        )
-    return autotuner.run(warmup=3, rep=20)
-
-
-def get_heuristic_config() -> dict:
-    # Get CUDA device properties
-    if not torch.cuda.is_available():
-        raise RuntimeError("CUDA is not available")
-    device = torch.cuda.current_device()
-    sm_major, sm_minor = torch.cuda.get_device_capability(device)
-    sm_version = sm_major * 10 + sm_minor
-    print(f"CUDA device capability: {sm_version}")
-    if sm_version in {80}:
-        return {
-            "block_M": 128,
-            "block_N": 256,
-            "block_K": 32,
-            "num_stages": 2,
-            "thread_num": 128,
-            "enable_rasteration": True
-        }
-    elif sm_version in {90}:
-        return {
-            "block_M": 128,
-            "block_N": 256,
-            "block_K": 64,
-            "num_stages": 3,
-            "thread_num": 256,
-            "enable_rasteration": True
-        }
-    else:
-        return {
-            "block_M": 128,
-            "block_N": 256,
-            "block_K": 32,
-            "num_stages": 0,
-            "thread_num": 128,
-            "enable_rasteration": True
-        }
-
-
-def matmul(M,
-           N,
-           K,
-           block_M,
-           block_N,
-           block_K,
-           num_stages,
-           thread_num,
-           enable_rasteration,
-           dtype="float16",
-           accum_dtype="float"):
+
+
+def matmul(M, N, K, block_M, block_N, block_K, dtype="float16", accum_dtype="float"):
+    """Build a tiled GEMM prim_func computing ``C = A @ B``.
+
+    Args:
+        M, N, K: Problem sizes; A is (M, K), B is (K, N) and C is (M, N).
+        block_M, block_N, block_K: Tile sizes along M, N and K.
+        dtype: Element type of A, B and C.
+        accum_dtype: Element type of the accumulator fragment.
+
+    Returns:
+        The ``T.prim_func`` named ``main``. The thread count (128) and the
+        pipeline depth (3) are fixed inside the kernel, not parameters.
+    """

    @T.prim_func
    def main(
            A: T.Tensor((M, K), dtype),
-            B: T.Tensor((N, K), dtype),
+            B: T.Tensor((K, N), dtype),
            C: T.Tensor((M, N), dtype),
    ):
-        with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=thread_num) as (bx, by):
+        # One block per (block_M x block_N) tile of C: bx indexes tiles along
+        # N, by indexes tiles along M.
+        with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=128) as (bx, by):
            A_shared = T.alloc_shared((block_M, block_K), dtype)
-            B_shared = T.alloc_shared((block_N, block_K), dtype)
+            B_shared = T.alloc_shared((block_K, block_N), dtype)
            C_local = T.alloc_fragment((block_M, block_N), accum_dtype)
-            C_shared = T.alloc_shared((block_M, block_N), dtype)
-            T.use_swizzle(panel_size=10, enable=enable_rasteration)
+
            T.clear(C_local)
-            for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages):
+            # Walk K in block_K-sized steps, accumulating each tile product
+            # into C_local.
+            for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=3):
                T.copy(A[by * block_M, k * block_K], A_shared)
-                T.copy(B[bx * block_N, k * block_K], B_shared)
-                T.gemm(
-                    A_shared,
-                    B_shared,
-                    C_local,
-                    transpose_B=True,
-                )
-            T.copy(C_local, C_shared)
-            T.copy(C_shared, C[by * block_M, bx * block_N])
+                T.copy(B[k * block_K, bx * block_N], B_shared)
+                T.gemm(A_shared, B_shared, C_local)
+
+            # The accumulator is copied straight into this block's tile of C.
+            T.copy(C_local, C[by * block_M, bx * block_N])

    return main


-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Autotuned MatMul Benchmark")
-    parser.add_argument("--m", type=int, default=16384, help="Matrix dimension M")
-    parser.add_argument("--n", type=int, default=16384, help="Matrix dimension N")
-    parser.add_argument("--k", type=int, default=16384, help="Matrix dimension K")
-    parser.add_argument(
-        "--use_autotune",
-        action="store_true",
-        default=False,
-        help="Whether to use autotune for matmul configs")
-    parser.add_argument(
-        "--with_roller",
-        action="store_true",
-        default=True,
-        help="Whether to enable BitBLAS roller for search space")
-    args = parser.parse_args()
-    M, N, K = args.m, args.n, args.k
-    a = torch.randn(M, K).cuda().half()
-    b = torch.randn(N, K).cuda().half()
-    use_autotune = args.use_autotune
-    use_autotune = True
-    with_roller = args.with_roller
-    if use_autotune:
-        result = get_best_config(M, N, K, with_roller)
-        print(result.config)
-        kernel = result.kernel
-    else:
-        config = get_heuristic_config()
-        kernel = tl.compile(matmul(M, N, K, **config), out_idx=-1)
-
-    # benchmark
-    profiler = kernel.get_profiler(tensor_supply_type=tl.TensorSupplyType.Auto)
-    tilelang_latency = profiler.do_bench()
-    ref_latency = profiler.do_bench(ref_program)
-    profiler.assert_allclose(ref_program, atol=1e-2, rtol=1e-2)
-    print(f"TileLang latency: {tilelang_latency}")
-    print(f"Ref latency: {ref_latency}")
-    print(f"TileLang TFlops: {2 * M * N * K / tilelang_latency * 1e-9}")
-    print(f"Ref TFlops: {2 * M * N * K / ref_latency * 1e-9}")
+# Build a 1024 x 1024 x 1024 GEMM with 128 x 128 output tiles and a K step of 32.
+func = matmul(1024, 1024, 1024, 128, 128, 32)
+
+print(func)
+
+# NOTE(review): out_idx=-1 presumably marks the last parameter (C) as the
+# output that the compiled kernel allocates and returns; the call below passes
+# only a and b and receives c. Confirm against the tilelang.compile docs.
+kernel = tilelang.compile(func, out_idx=-1)
+
+import torch
+
+# Random half-precision inputs on the GPU, sized to match the kernel above.
+a = torch.randn(1024, 1024).cuda().half()
+b = torch.randn(1024, 1024).cuda().half()
+
+c = kernel(a, b)
+
+# PyTorch reference result for the correctness check.
+ref_c = a @ b
+
+print("c:")
+print(c)
+print("ref_c:")
+print(ref_c)
+
+# Raises if the kernel output and the reference differ beyond the tolerances.
+torch.testing.assert_close(c, ref_c, rtol=1e-2, atol=1e-2)
+print("All check passed.")
+
+# Get CUDA Source
+print("CUDA Source:")
+print(kernel.get_kernel_source())
\ No newline at end of file
--- a/examples/gemm/example_gemm_autotune.py
+++ b/examples/gemm/example_gemm_autotune.py
+import argparse
+import torch
+import itertools
+import tilelang as tl
+import tilelang.language as T
+from tilelang.autotuner import AutoTuner
+from tilelang.carver.template import MatmulTemplate
+from tilelang.carver.arch import CUDA
+from tilelang.carver.roller.rasterization import NoRasterization
+
+
+def ref_program(A, B):
+    """Return the reference GEMM result ``A @ B.T``.
+
+    B is transposed before the product, matching the kernels in this file,
+    which declare B as (N, K) and call ``T.gemm`` with ``transpose_B=True``.
+    """
+    return A @ B.T
+
+
+def get_configs(M, N, K, with_roller=False, topk=20):
+    """Build the list of candidate kernel configurations to autotune over.
+
+    Args:
+        M, N, K: GEMM problem sizes; only read on the roller path, where they
+            are passed to ``MatmulTemplate``.
+        with_roller: If true, derive the configs from
+            ``MatmulTemplate.recommend_hints``; otherwise enumerate a fixed
+            grid of values.
+        topk: Number of hints requested from the roller (unused on the grid
+            path).
+
+    Returns:
+        A list of dicts, each with the keys ``block_M``, ``block_N``,
+        ``block_K``, ``num_stages``, ``thread_num`` and ``enable_rasteration``.
+
+    Raises:
+        ValueError: If ``recommend_hints`` returns ``None``.
+    """
+    if with_roller:
+        arch = CUDA("cuda")
+        carve_template = MatmulTemplate(
+            M=M,
+            N=N,
+            K=K,
+            in_dtype="float16",
+            out_dtype="float16",
+            accum_dtype="float",
+        ).with_arch(arch)
+
+        # func is only used by the sanity assert on the next line.
+        func = carve_template.equivalent_function()
+        assert func is not None, "Function is None"
+        roller_hints = carve_template.recommend_hints(topk=topk)
+        if roller_hints is None:
+            raise ValueError("No Roller Hints Found for TensorCore Scheduling")
+        configs = []
+        for hint in roller_hints:
+            config = {}
+            block_m, block_n = hint.block
+            warp_m, warp_n = hint.warp
+            # block_rows, block_cols represents warp partitioning
+            block_rows, block_cols = block_m // warp_m, block_n // warp_n
+            config["block_M"] = block_m
+            config["block_N"] = block_n
+            config["block_K"] = hint.rstep[0]
+            # Pipeline stage counts of 1 or less are mapped to 0.
+            config["num_stages"] = hint.pipeline_stage if hint.pipeline_stage > 1 else 0
+            # 32 threads per warp tile (32 is presumably the CUDA warp size).
+            config["thread_num"] = block_rows * block_cols * 32
+            # NOTE(review): this is an identity comparison against the imported
+            # name NoRasterization. If rasterization_plan holds an instance
+            # rather than that exact object, the result is always True; confirm
+            # whether an isinstance check was intended.
+            config["enable_rasteration"] = hint.rasterization_plan is not NoRasterization
+            configs.append(config)
+        # Echo the roller-derived configs.
+        for config in configs:
+            print(config)
+    else:
+        # Exhaustive grid: 3 * 3 * 2 * 4 * 2 * 2 = 288 combinations.
+        block_M = [64, 128, 256]
+        block_N = [64, 128, 256]
+        block_K = [32, 64]
+        num_stages = [0, 1, 2, 3]
+        thread_num = [128, 256]
+        enable_rasterization = [True, False]
+        _configs = list(
+            itertools.product(
+                block_M,
+                block_N,
+                block_K,
+                num_stages,
+                thread_num,
+                enable_rasterization,
+            ))
+
+        # Tuple positions follow the argument order of the product call above.
+        configs = [
+            {
+                "block_M": c[0],
+                "block_N": c[1],
+                "block_K": c[2],
+                "num_stages": c[3],
+                "thread_num": c[4],
+                "enable_rasteration": c[5],  # keep param name for backward-compat
+            } for c in _configs
+        ]
+    return configs
+
+
+def get_best_config(M, N, K, with_roller=False):
+    """Autotune the GEMM kernel for an (M, N, K) problem.
+
+    Args:
+        M, N, K: GEMM problem sizes baked into the generated kernels.
+        with_roller: Forwarded to ``get_configs`` to choose the search space.
+
+    Returns:
+        The object returned by ``AutoTuner.run``; the caller in this file
+        reads its ``config`` and ``kernel`` attributes.
+    """
+
+    def kernel(
+        block_M=None,
+        block_N=None,
+        block_K=None,
+        num_stages=None,
+        thread_num=None,
+        enable_rasteration=None,
+    ):
+        # Kernel factory: the parameter names match the keys of the config
+        # dicts from get_configs (presumably the AutoTuner passes one config
+        # per call as keyword arguments; confirm against the AutoTuner docs).
+        dtype = "float16"
+        accum_dtype = "float"
+
+        @T.prim_func
+        def main(
+                A: T.Tensor((M, K), dtype),
+                B: T.Tensor((N, K), dtype),
+                C: T.Tensor((M, N), dtype),
+        ):
+            # One block per (block_M x block_N) tile of C: bx indexes tiles
+            # along N, by indexes tiles along M.
+            with T.Kernel(
+                    T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=thread_num) as (bx, by):
+                A_shared = T.alloc_shared((block_M, block_K), dtype)
+                B_shared = T.alloc_shared((block_N, block_K), dtype)
+                C_local = T.alloc_fragment((block_M, block_N), accum_dtype)
+                C_shared = T.alloc_shared((block_M, block_N), dtype)
+                # enable_rasteration toggles T.use_swizzle; panel_size is fixed at 10.
+                T.use_swizzle(panel_size=10, enable=enable_rasteration)
+                T.clear(C_local)
+                # Walk K in block_K-sized steps. B tiles are (block_N, block_K),
+                # so the gemm is issued with transpose_B=True.
+                for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages):
+                    T.copy(A[by * block_M, k * block_K], A_shared)
+                    T.copy(B[bx * block_N, k * block_K], B_shared)
+                    T.gemm(
+                        A_shared,
+                        B_shared,
+                        C_local,
+                        transpose_B=True,
+                    )
+                # Write back in two hops: C_local -> C_shared -> C.
+                T.copy(C_local, C_shared)
+                T.copy(C_shared, C[by * block_M, bx * block_N])
+
+        return main
+
+    # NOTE(review): with ref_prog set and skip_check=False, each candidate is
+    # presumably validated against ref_program during tuning; confirm against
+    # the AutoTuner docs.
+    autotuner = AutoTuner.from_kernel(
+        kernel=kernel, configs=get_configs(M, N, K, with_roller)).set_compile_args(
+            out_idx=[-1],
+            supply_type=tl.TensorSupplyType.Integer,
+            ref_prog=ref_program,
+            skip_check=False,
+            target="auto",
+        )
+    return autotuner.run(warmup=3, rep=20)
+
+
+def get_heuristic_config() -> dict:
+    """Return a hand-picked kernel configuration for the current CUDA device.
+
+    The choice is keyed on the compute capability encoded as
+    ``major * 10 + minor`` (for example, (8, 0) becomes 80). Only the exact
+    values 80 and 90 have dedicated entries; every other value gets the
+    fallback configuration.
+
+    Returns:
+        A dict with the keys ``block_M``, ``block_N``, ``block_K``,
+        ``num_stages``, ``thread_num`` and ``enable_rasteration``.
+
+    Raises:
+        RuntimeError: If ``torch.cuda.is_available()`` is false.
+    """
+    # Get CUDA device properties
+    if not torch.cuda.is_available():
+        raise RuntimeError("CUDA is not available")
+    device = torch.cuda.current_device()
+    sm_major, sm_minor = torch.cuda.get_device_capability(device)
+    sm_version = sm_major * 10 + sm_minor
+    print(f"CUDA device capability: {sm_version}")
+    if sm_version in {80}:
+        return {
+            "block_M": 128,
+            "block_N": 256,
+            "block_K": 32,
+            "num_stages": 2,
+            "thread_num": 128,
+            "enable_rasteration": True
+        }
+    elif sm_version in {90}:
+        return {
+            "block_M": 128,
+            "block_N": 256,
+            "block_K": 64,
+            "num_stages": 3,
+            "thread_num": 256,
+            "enable_rasteration": True
+        }
+    else:
+        # Fallback for every other capability value: same tiles as the 80
+        # entry, but with num_stages set to 0.
+        return {
+            "block_M": 128,
+            "block_N": 256,
+            "block_K": 32,
+            "num_stages": 0,
+            "thread_num": 128,
+            "enable_rasteration": True
+        }
+
+
+def matmul(M,
+           N,
+           K,
+           block_M,
+           block_N,
+           block_K,
+           num_stages,
+           thread_num,
+           enable_rasteration,
+           dtype="float16",
+           accum_dtype="float"):
+    """Build a tiled GEMM prim_func for one fixed configuration.
+
+    Args:
+        M, N, K: Problem sizes; A is (M, K), B is (N, K) and C is (M, N).
+        block_M, block_N, block_K: Tile sizes along M, N and K.
+        num_stages: Passed to ``T.Pipelined`` for the K loop.
+        thread_num: Passed as ``threads`` to ``T.Kernel``.
+        enable_rasteration: Passed as ``enable`` to ``T.use_swizzle``.
+        dtype: Element type of A, B and C.
+        accum_dtype: Element type of the accumulator fragment.
+
+    Returns:
+        The ``T.prim_func`` named ``main``.
+    """
+
+    @T.prim_func
+    def main(
+            A: T.Tensor((M, K), dtype),
+            B: T.Tensor((N, K), dtype),
+            C: T.Tensor((M, N), dtype),
+    ):
+        # One block per (block_M x block_N) tile of C: bx indexes tiles along
+        # N, by indexes tiles along M.
+        with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=thread_num) as (bx, by):
+            A_shared = T.alloc_shared((block_M, block_K), dtype)
+            B_shared = T.alloc_shared((block_N, block_K), dtype)
+            C_local = T.alloc_fragment((block_M, block_N), accum_dtype)
+            C_shared = T.alloc_shared((block_M, block_N), dtype)
+            T.use_swizzle(panel_size=10, enable=enable_rasteration)
+            T.clear(C_local)
+            # Walk K in block_K-sized steps. B tiles are (block_N, block_K), so
+            # the gemm is issued with transpose_B=True.
+            for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages):
+                T.copy(A[by * block_M, k * block_K], A_shared)
+                T.copy(B[bx * block_N, k * block_K], B_shared)
+                T.gemm(
+                    A_shared,
+                    B_shared,
+                    C_local,
+                    transpose_B=True,
+                )
+            # Write back in two hops: C_local -> C_shared -> C.
+            T.copy(C_local, C_shared)
+            T.copy(C_shared, C[by * block_M, bx * block_N])
+
+    return main
+
+
+if __name__ == "__main__":
+    # Command-line driver: pick a kernel (autotuned or heuristic), benchmark it
+    # against the reference program and check its output.
+    parser = argparse.ArgumentParser(description="Autotuned MatMul Benchmark")
+    parser.add_argument("--m", type=int, default=16384, help="Matrix dimension M")
+    parser.add_argument("--n", type=int, default=16384, help="Matrix dimension N")
+    parser.add_argument("--k", type=int, default=16384, help="Matrix dimension K")
+    parser.add_argument(
+        "--use_autotune",
+        action="store_true",
+        default=False,
+        help="Whether to use autotune for matmul configs")
+    # The roller stays enabled by default. A store_true flag that defaults to
+    # True can never be switched off, so --no_roller writes False to the same
+    # destination.
+    parser.add_argument(
+        "--with_roller",
+        dest="with_roller",
+        action="store_true",
+        default=True,
+        help="Whether to enable BitBLAS roller for search space")
+    parser.add_argument(
+        "--no_roller",
+        dest="with_roller",
+        action="store_false",
+        help="Disable the roller and search the fixed configuration grid instead")
+    args = parser.parse_args()
+    M, N, K = args.m, args.n, args.k
+    # Respect --use_autotune as parsed so that the heuristic branch below is
+    # reachable. No input tensors are created here: the profiler supplies its
+    # own via tensor_supply_type.
+    use_autotune = args.use_autotune
+    with_roller = args.with_roller
+    if use_autotune:
+        result = get_best_config(M, N, K, with_roller)
+        print(result.config)
+        kernel = result.kernel
+    else:
+        config = get_heuristic_config()
+        kernel = tl.compile(matmul(M, N, K, **config), out_idx=-1)
+
+    # benchmark
+    profiler = kernel.get_profiler(tensor_supply_type=tl.TensorSupplyType.Auto)
+    tilelang_latency = profiler.do_bench()
+    ref_latency = profiler.do_bench(ref_program)
+    profiler.assert_allclose(ref_program, atol=1e-2, rtol=1e-2)
+    print(f"TileLang latency: {tilelang_latency}")
+    print(f"Ref latency: {ref_latency}")
+    # 2 * M * N * K floating-point operations per GEMM. The 1e-9 factor yields
+    # TFLOPS only if do_bench reports milliseconds (TODO confirm the unit).
+    print(f"TileLang TFlops: {2 * M * N * K / tilelang_latency * 1e-9}")
+    print(f"Ref TFlops: {2 * M * N * K / ref_latency * 1e-9}")