Merge branch 'main' of https://github.com/microsoft/TileLang into main

549416f7 · LeiWang1999 · 4d63633a · 7fad4e88 · 549416f7 · 549416f7
Commit 549416f7 authored Jan 11, 2025 by LeiWang1999
10 changed files
--- a/tilelang/language/customize.py
+++ b/tilelang/language/customize.py
@@ -10,12 +10,8 @@ def atomic_add(dst, value):
 def atomic_addx2(dst, value):
-    return T.call_extern(
+    return T.call_extern("handle", "atomicAddx2", T.address_of(dst), T.address_of(value))
-        "handle", "atomicAddx2", T.address_of(dst), T.address_of(value)
-    )
 def dp4a(A, B, C):
-    return T.call_extern(
+    return T.call_extern("handle", "DP4A", T.address_of(A), T.address_of(B), T.address_of(C))
-        "handle", "DP4A", T.address_of(A), T.address_of(B), T.address_of(C)
-    )
--- a/tilelang/language/gemm.py
+++ b/tilelang/language/gemm.py
@@ -4,6 +4,7 @@
 from tvm import tir
 class GemmWarpPolicy:
    Square = 0
    FullRow = 1

--- a/tilelang/language/kernel.py
+++ b/tilelang/language/kernel.py
@@ -145,6 +145,7 @@ class KernelLaunchFrame(TIRFrame):
        """
        return self.get_num_threads()
 def Kernel(
    *blocks: List[tir.PrimExpr],
    threads: Union[int, List[int], Tuple] = 128,

--- a/tilelang/language/pipeline.py
+++ b/tilelang/language/pipeline.py
@@ -45,6 +45,4 @@ def Pipelined(
    if group is None:
        group = []
    # type: ignore[attr-defined] # pylint: disable=no-member
-    return _ffi_api.Pipelined(
+    return _ffi_api.Pipelined(start, stop, num_stages, order, stage, sync, group)
-        start, stop, num_stages, order, stage, sync, group
-    )
--- a/tilelang/language/reduce.py
+++ b/tilelang/language/reduce.py
@@ -4,9 +4,8 @@
 from tvm import tir
-def reduce(
-    buffer: tir.Buffer, out: tir.Buffer, reduce_type: str, dim: int, clear: bool
+def reduce(buffer: tir.Buffer, out: tir.Buffer, reduce_type: str, dim: int, clear: bool):
-):
    buffer = buffer.access_ptr("r")
    out = out.access_ptr("w")
    return tir.call_intrin(
@@ -20,9 +19,7 @@ def reduce(
    )
-def reduce_max(
+def reduce_max(buffer: tir.Buffer, out: tir.Buffer, dim: int, clear: bool = True):
-    buffer: tir.Buffer, out: tir.Buffer, dim: int, clear: bool = True
-):
    """Perform reduce max on input buffer, store the result to output buffer
    Parameters
@@ -42,9 +39,7 @@ def reduce_max(
    return reduce(buffer, out, "max", dim, clear)
-def reduce_min(
+def reduce_min(buffer: tir.Buffer, out: tir.Buffer, dim: int, clear: bool = True):
-    buffer: tir.Buffer, out: tir.Buffer, dim: int, clear: bool = True
-):
    return reduce(buffer, out, "min", dim, clear)

--- a/tilelang/layout/swizzle.py
+++ b/tilelang/layout/swizzle.py
@@ -6,6 +6,7 @@
 import tvm
 from tilelang import _ffi_api
 def make_swizzled_layout(buffer: tvm.tir.Buffer):
    assert len(buffer.shape) == 2
    return _ffi_api.make_swizzled_layout(

--- a/tilelang/primitives/gemm/__init__.py
+++ b/tilelang/primitives/gemm/__init__.py
@@ -6,8 +6,8 @@ from tvm import tir
 from tilelang.primitives.utils import is_local, is_fragment, is_shared
 from tilelang.primitives.gemm.base import GemmWarpPolicy
 from tilelang.primitives.gemm.gemm_mma import (
-    GemmPrimitiveMMA,
+    GemmPrimitiveMMA,)
-)
 def gemm(
    A: tir.Buffer,
@@ -24,16 +24,13 @@ def gemm(
    k_pack: int = 1,
 ):
    assert is_local(A) or is_fragment(A) or is_shared(A), (
-        f"Expected A to be a local, fragment, or shared buffer, but got {A.scope()}"
+        f"Expected A to be a local, fragment, or shared buffer, but got {A.scope()}")
-    )
    assert is_local(B) or is_fragment(B) or is_shared(B), (
-        f"Expected B to be a local, fragment, or shared buffer, but got {B.scope()}"
+        f"Expected B to be a local, fragment, or shared buffer, but got {B.scope()}")
-    )
    assert is_local(C) or is_fragment(C), (
-        f"Expected C to be a local, fragment, but got {C.scope()}"
+        f"Expected C to be a local, fragment, but got {C.scope()}")
-    )
    # TODO(lei): Now we only support Nvidia GPUs
-    # Must enhance the design to implement runtime lowering 
+    # Must enhance the design to implement runtime lowering
    # for different targets (hip mfma for example)
    return GemmPrimitiveMMA(
        A=A,

--- a/tilelang/primitives/gemm/base.py
+++ b/tilelang/primitives/gemm/base.py
@@ -7,6 +7,7 @@ from dataclasses import dataclass
 from typing import Optional
 from tvm import tir
 class GemmWarpPolicy(IntEnum):
    """
    Enumeration for GEMM Warp Partitioning Policies.
@@ -89,16 +90,12 @@ class GemmWarpPolicy(IntEnum):
        if self.is_full_row():
            # FullRow policy: Allocate all warps to rows.
            m_warp = num_warps
-            assert (
+            assert (M % num_warps == 0), "M must be divisible by num_warps for FullRow policy"
-                M % num_warps == 0
-            ), "M must be divisible by num_warps for FullRow policy"
        elif self.is_full_col():
            # FullCol policy: Allocate all warps to columns.
            n_warp = num_warps
-            assert (
+            assert (N % num_warps == 0), "N must be divisible by num_warps for FullCol policy"
-                N % num_warps == 0
-            ), "N must be divisible by num_warps for FullCol policy"
        elif self.is_square():
            # Square policy: Try to balance warps across rows and columns.
@@ -136,7 +133,7 @@ class GemmBaseParams:
    A: tir.Buffer
    B: tir.Buffer
    C: tir.Buffer
    transpose_A: bool = False
    transpose_B: bool = False
    block_row_warps: Optional[int] = None
@@ -148,7 +145,7 @@ class GemmBaseParams:
    k_pack: int = 1
    def get_warp_size(self) -> int:
-        # must rewrite to 64 if the target 
+        # must rewrite to 64 if the target
        # is cdna mfma
        return 32
@@ -168,7 +165,6 @@ class GemmBaseParams:
            "k_pack": self.k_pack,
        }
    def infer_block_partition(self, threads: Optional[int]) -> None:
        """
        Infer and set block partition parameters (e.g., block_row_warps,
@@ -210,19 +206,13 @@ class GemmBaseParams:
        # Determine whether block partition parameters need to be inferred
        require_infer = (
-            block_row_warps is None
+            block_row_warps is None or block_col_warps is None or warp_row_tiles is None or
-            or block_col_warps is None
+            warp_col_tiles is None or chunk is None)
-            or warp_row_tiles is None
-            or warp_col_tiles is None
-            or chunk is None
-        )
        A_shape, B_shape = A.shape, B.shape
        if require_infer:
-            assert (
+            assert (threads is not None), "threads must be provided for auto inference"
-                threads is not None
-            ), "threads must be provided for auto inference"
            # Auto-inference only supports 2D matrix multiplication
            assert (
                len(A_shape) == 2 and len(B_shape) == 2
@@ -241,28 +231,24 @@ class GemmBaseParams:
            # Infer block partition using a user-specified policy
            block_row_warps, block_col_warps = policy.compute_warp_partition(
-                block_M, block_N, num_warps
+                block_M, block_N, num_warps)
-            )
            warp_row_tiles = block_M // block_row_warps
            warp_col_tiles = block_N // block_col_warps
            chunk = int(AK)
        # rewrite the values
        self.block_row_warps = block_row_warps
        self.block_col_warps = block_col_warps
        self.warp_row_tiles = warp_row_tiles
        self.warp_col_tiles = warp_col_tiles
        self.chunk = chunk
    @property
    def class_attributes(self):
        return self.params_as_dict()
    def __repr__(self) -> str:
        cls_name = self.__class__.__name__
        fields = self.class_attributes
-        field_str = ", ".join(
+        field_str = ", ".join(f"{key}={value!r}" for key, value in fields.items())
-            f"{key}={value!r}" for key, value in fields.items()
-        )
        return f"{cls_name}({field_str})"
--- a/tilelang/primitives/gemm/gemm_mma.py
+++ b/tilelang/primitives/gemm/gemm_mma.py
@@ -11,6 +11,7 @@ from tilelang.primitives.utils import is_fragment, array_reduce
 from tilelang.primitives.gemm.base import GemmBaseParams
 from tilelang.intrinsics.mma_macro_generator import TensorCoreIntrinEmitter
 # TODO(lei): Implement GEMM_SR, GEMM_RS, GEMM_RR
 @dataclass
 class GemmPrimitiveMMA(GemmBaseParams):
@@ -35,7 +36,7 @@ class GemmPrimitiveMMA(GemmBaseParams):
        B: tir.Buffer,
        C: tir.Buffer,
        mma_emitter: TensorCoreIntrinEmitter,
-    )-> tir.PrimExpr:
+    ) -> tir.PrimExpr:
        in_dtype = self.in_dtype
        warp_rows = mma_emitter.warp_rows
@@ -50,9 +51,7 @@ class GemmPrimitiveMMA(GemmBaseParams):
        c_is_fragment = is_fragment(C)
        @T.macro
-        def _gemm_rsr(
+        def _gemm_rsr(A_local: tir.Buffer, B_shared: tir.Buffer, C_local: tir.Buffer) -> None:
-            A_local: tir.Buffer, B_shared: tir.Buffer, C_local: tir.Buffer
-        ) -> None:
            """
            The inner macro that loads data from shared buffers A_shared and
            B_shared into local fragments, then issues Tensor Core mma ops,
@@ -63,18 +62,14 @@ class GemmPrimitiveMMA(GemmBaseParams):
            thread_bindings = T.thread_binding(0, threads, "threadIdx.x")
            if a_is_fragment:
                # Annotate layout for A_local if it is a fragment.
-                T.annotate_layout(
+                T.annotate_layout({
-                    {
+                    A_local: mma_emitter.make_mma_load_layout(A_local, "A"),
-                        A_local: mma_emitter.make_mma_load_layout(A_local, "A"),
+                })
-                    }
-                )
            if c_is_fragment:
                # Annotate layout for C_local if it is a fragment.
-                T.annotate_layout(
+                T.annotate_layout({
-                    {
+                    C_local: mma_emitter.make_mma_store_layout(C_local),
-                        C_local: mma_emitter.make_mma_store_layout(C_local),
+                })
-                    }
-                )
            for ki in T.serial(0, (block_K // micro_size_k)):
@@ -101,7 +96,7 @@ class GemmPrimitiveMMA(GemmBaseParams):
        B: tir.Buffer,
        C: tir.Buffer,
        mma_emitter: TensorCoreIntrinEmitter,
-    )-> tir.PrimExpr:
+    ) -> tir.PrimExpr:
        raise NotImplementedError("GEMM_RSR is not implemented yet")
    def gemm_ssr(
@@ -147,9 +142,7 @@ class GemmPrimitiveMMA(GemmBaseParams):
        c_is_fragment = is_fragment(C)
        @T.macro
-        def _gemm_ssr(
+        def _gemm_ssr(A_shared: tir.Buffer, B_shared: tir.Buffer, C_local: tir.Buffer) -> None:
-            A_shared: tir.Buffer, B_shared: tir.Buffer, C_local: tir.Buffer
-        ) -> None:
            """
            The inner macro that loads data from shared buffers A_shared and
            B_shared into local fragments, then issues Tensor Core mma ops,
@@ -162,13 +155,9 @@ class GemmPrimitiveMMA(GemmBaseParams):
            if c_is_fragment:
                # Annotate layout for C_local if it is a fragment.
-                T.annotate_layout(
+                T.annotate_layout({
-                    {
+                    C_local: mma_emitter.make_mma_store_layout(C_local),
-                        C_local: mma_emitter.make_mma_store_layout(
+                })
-                            C_local
-                        ),
-                    }
-                )
            for ki in T.serial(0, (block_K // micro_size_k)):
                # Load A into fragment

--- a/tilelang/primitives/utils.py
+++ b/tilelang/primitives/utils.py
@@ -37,6 +37,7 @@ def is_shared(buffer: Buffer, allow_dynamic: bool = True) -> bool:
        conditions.append(is_shared_dynamic(buffer))
    return any(conditions)
 def is_shared_dynamic(buffer: Buffer) -> bool:
    """
    Check if the buffer is in the dynamic shared memory scope.
@@ -75,6 +76,7 @@ def is_fragment(buffer: Buffer) -> bool:
    """
    return buffer.scope().startswith("local.fragment")
 def array_reduce(array: List[int]) -> int:
    """
    Reduce an array of integers to a single integer.