"...git@developer.sourcefind.cn:yangql/composable_kernel.git" did not exist on "5bf0475afdf33e823ed8c0247b3a8711326601d3"
Commit f944b79e authored by Cunxiao Ni, committed by GitHub

[CI][Test] Add test cases for element_add (#47)

* [CI][Test] Add test cases for element_add

* [Doc] fix typo

* Parallelization

* format

* remove useless condition

* format
parent bedab1a0
@@ -3,25 +3,26 @@
 import tilelang
 import tilelang.language as T
 # `make_mma_swizzle_layout` is a python defined layout function
-# specifically designed for for MMA operations
+# specifically designed for MMA operations
 # which ensures the consistency with the nvidia CUTLASS Library.
 # to avoid bank conflicts and maximize the performance.
 from tilelang.intrinsics import (
-    make_mma_swizzle_layout as make_swizzle_layout,)
+    make_mma_swizzle_layout as make_swizzle_layout,)  # noqa: F401
 def matmul(M, N, K, block_M, block_N, block_K, dtype="float16", accum_dtype="float"):
     # add decorator @tilelang.jit if you want to return a torch function
     @T.prim_func
     def main(
             A: T.Buffer((M, K), dtype),
             B: T.Buffer((K, N), dtype),
             C: T.Buffer((M, N), dtype),
     ):
         # Initialize Kernel Context
         with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=128) as (bx, by):
             A_shared = T.alloc_shared((block_M, block_K), dtype)
             B_shared = T.alloc_shared((block_K, block_N), dtype)
             C_local = T.alloc_fragment((block_M, block_N), accum_dtype)
             # Apply layout optimizations or define your own layout (Optional)
             # If not specified, we will deduce the layout automatically
@@ -71,7 +72,6 @@ import torch
 a = torch.randn(1024, 1024, device="cuda", dtype=torch.float16)
 b = torch.randn(1024, 1024, device="cuda", dtype=torch.float16)
-# Run the kernel through the Profiler
 c = jit_kernel(a, b)
@@ -86,7 +86,7 @@ print("Kernel output matches PyTorch reference.")
 cuda_source = jit_kernel.get_kernel_source()
 print("Generated CUDA kernel:\n", cuda_source)
-# 5.Pofile latency with kernel
+# 5.Profile latency with kernel
 profiler = jit_kernel.get_profiler()
 latency = profiler.do_bench()
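For context on the last hunk: the README snippet verifies the kernel output against a PyTorch reference before profiling. Below is a minimal sketch of such a check using only standard PyTorch APIs; `a`, `b`, and `c` are the tensors from the snippet above, the construction of `jit_kernel` is not part of this diff, and the tolerances are illustrative rather than the README's actual values.

import torch

# `c = jit_kernel(a, b)` was computed above; compare it against a plain PyTorch matmul.
ref_c = a @ b  # float16 reference matmul on the same CUDA device
torch.testing.assert_close(c, ref_c, rtol=1e-2, atol=1e-2)
print("Kernel output matches PyTorch reference.")

The new test file added by this commit follows.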
from tilelang import tvm as tvm
import tilelang.testing
import tilelang as tl
import torch


def elementwise_add(
    M,
    N,
    block_M,
    block_N,
    in_dtype,
    out_dtype,
    threads,
):
    import tilelang.language as T

    @T.prim_func
    def main(
            A: T.Buffer((M, N), in_dtype),
            B: T.Buffer((M, N), in_dtype),
            C: T.Buffer((M, N), out_dtype),
    ):
        # Launch one thread block per (block_M, block_N) tile of the output.
        with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by):
            start_x = bx * block_N
            start_y = by * block_M
            # T.Parallel distributes the tile's elements across the block's threads.
            for (local_y, local_x) in T.Parallel(block_M, block_N):
                y = start_y + local_y
                x = start_x + local_x
                C[y, x] = A[y, x] + B[y, x]

    return main
def run_elementwise_add(
    M,
    N,
    in_dtype,
    out_dtype,
    block_M,
    block_N,
    num_threads=128,
):
    program = elementwise_add(
        M,
        N,
        block_M,
        block_N,
        in_dtype,
        out_dtype,
        num_threads,
    )
    # Lower the TIR program and wrap it in a Profiler that supplies integer-valued inputs.
    mod, params = tl.lower(program)
    mod = tl.Profiler(mod, params, [2], tl.TensorSupplyType.Integer)

    # PyTorch reference: elementwise add, cast to the requested output dtype.
    def ref_program(A, B):
        C = torch.add(A, B)
        C = C.to(getattr(torch, out_dtype))
        return C

    mod.assert_allclose(ref_program, atol=1e-2, rtol=1e-2)
def test_elementwise_add_f32():
    run_elementwise_add(
        512,
        1024,
        "float32",
        "float32",
        128,
        256,
    )


def test_elementwise_add_f16():
    run_elementwise_add(
        512,
        1024,
        "float16",
        "float16",
        128,
        256,
    )


def test_elementwise_add_i32():
    run_elementwise_add(
        512,
        1024,
        "int32",
        "int32",
        128,
        256,
    )


def test_elementwise_add_f32f16():
    run_elementwise_add(
        512,
        1024,
        "float32",
        "float16",
        128,
        256,
    )


if __name__ == "__main__":
    tilelang.testing.main()
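As a usage note, these cases run through `tilelang.testing.main()` as above (the file is also collectable by pytest, since each case is a plain `test_*` function). A single configuration can likewise be exercised directly; the following is a hypothetical one-off call to the helper defined above, assuming a CUDA-capable device, with shape and tile values chosen only for illustration.

# Hypothetical direct invocation: float32 inputs, float16 output (as in
# test_elementwise_add_f32f16), with a smaller 64x128 tile per thread block.
run_elementwise_add(256, 512, "float32", "float16", block_M=64, block_N=128)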