update cp_async & init inject_ds_read

3852d58b · wangziyang · 19cdf0ca · 3852d58b · 3852d58b
Commit 3852d58b authored Apr 03, 2026 by wangziyang
Hide whitespace changes
Inline Side-by-side

Showing with 23 additions and 0 deletions

tilelang/tileop/gemm/gemm_mma.py tilelang/tileop/gemm/gemm_mma.py +4 -0

tilelang/transform/__init__.py tilelang/transform/__init__.py +19 -0

No files found.
--- a/tilelang/tileop/gemm/gemm_mma.py
+++ b/tilelang/tileop/gemm/gemm_mma.py
@@ -29,24 +29,28 @@ class GemmMMA(GemmBase):
            chunk=self.chunk,
        )
        if self.is_gemm_ss():
+            print("gemm_ss")
            return {
                self.A: make_swizzled_layout(self.A),
                self.B: make_swizzled_layout(self.B),
                self.C: mma_emitter.make_mma_store_layout(self.C),
            }
        elif self.is_gemm_sr():
+            print("gemm_ss")
            return {
                self.A: make_swizzled_layout(self.A),
                self.B: mma_emitter.make_mma_load_layout(self.B, matrix="B"),
                self.C: mma_emitter.make_mma_store_layout(self.C),
            }
        elif self.is_gemm_rs():
+            print("gemm_ss")
            return {
                self.A: mma_emitter.make_mma_load_layout(self.A, matrix="A"),
                self.B: make_swizzled_layout(self.B),
                self.C: mma_emitter.make_mma_store_layout(self.C),
            }
        elif self.is_gemm_rr():
+            print("gemm_ss")
            return {
                self.A: mma_emitter.make_mma_load_layout(self.A, matrix="A"),
                self.B: mma_emitter.make_mma_load_layout(self.B, matrix="B"),

--- a/tilelang/transform/__init__.py
+++ b/tilelang/transform/__init__.py
@@ -338,9 +338,28 @@ def InjectPTXAsyncCopy():
    fpass : tvm.transform.Pass
        The result pass
    """
+    print("Injecting PTX async copy for global to shared memory copy on DCU.")
    return _ffi_api.InjectPTXAsyncCopy()  # type: ignore


+def InjectDSRead():
+    """Build the pass that rewrites shared-memory-to-register loads into ds_read on DCU.
+
+    The rewrite itself lives behind `_ffi_api.InjectDSRead`; it is described as replacing
+    shared-memory BufferLoads with one of these AMD DCU (gfx936, gfx942, etc.) instructions:
+
+    - ds_read_b64: loads 8 bytes (4 halfs or 2 floats) at once
+    - ds_read_m32x16_b16: loads 32 bytes (16 halfs) at once
+
+    Returns
+    -------
+    fpass : tvm.transform.Pass
+        The pass object returned by the FFI constructor.
+    """
+    print("Injecting ds_read for shared to register memory copy on DCU.")  # written to stdout on every call
+    return _ffi_api.InjectDSRead()  # type: ignore
+
+
 def LowerDeviceStorageAccessInfo():
    """Lower attached storage access information on device.