[Bugfix] Replace `K // block_K` with `T.ceildiv(K, block_K)` and add tests (#210)

e2bc1cb6 · Yuxuan Hu · LeiWang1999 · 227ed7ec · e2bc1cb6
Commit e2bc1cb6, authored Mar 14, 2025 by Yuxuan Hu; committed by LeiWang1999 on Mar 14, 2025
Show whitespace changes
Inline Side-by-side

Showing with 8 additions and 3 deletions

testing/python/kernel/test_tilelang_kernel_int4_gemm_mma.py +8 -3

No files found.
--- a/testing/python/kernel/test_tilelang_kernel_int4_gemm_mma.py
+++ b/testing/python/kernel/test_tilelang_kernel_int4_gemm_mma.py
@@ -119,7 +119,7 @@ def tl_matmul(

            T.clear(C_local)

-            for ko in T.Pipelined((K // block_K), num_stages=stage):
+            for ko in T.Pipelined(T.ceildiv(K, block_K), num_stages=stage):

                # Load A into shared memory
                for i, k in T.Parallel(block_M, block_K):
@@ -182,7 +182,7 @@ def assert_tl_matmul_correctness(M, N, K, in_dtype, out_dtype, accum_dtype):
    mod = TL.Profiler(mod, params, [], TL.TensorSupplyType.Integer)
    mod(compressed_A, compressed_B, C)
    print(C)
-    latency = mod.do_bench(mod.func, warmup=25, profiler="tvm")
+    latency = mod.do_bench(mod.func, warmup=25)
    print(latency)
    # Ensure that the latency is not None
    assert latency is not None
@@ -194,6 +194,11 @@ def assert_tl_matmul_correctness(M, N, K, in_dtype, out_dtype, accum_dtype):
    torch.testing.assert_close(C, ref_c, rtol=1e-2, atol=1e-2)


+def test_assert_tl_matmul_correctness():
+    assert_tl_matmul_correctness(128, 128, 128, "int8", "int32", "int32")
+    assert_tl_matmul_correctness(128, 128, 64, "int8", "int32", "int32")
+
+
 @simplify_prim_func
 def tl_matmul_weight_only_transform(
    M,
@@ -302,7 +307,7 @@ def tl_matmul_weight_only_transform(

            T.clear(C_local)

-            for ko in T.Pipelined((K // block_K), num_stages=stage):
+            for ko in T.Pipelined(T.ceildiv(K, block_K), num_stages=stage):

                # Load A into shared memory
                for i, k in T.Parallel(block_M, block_K):