Unverified Commit 667632cc authored by guchaoyang's avatar guchaoyang Committed by GitHub
Browse files

Merge branch 'main' into dcu

parents d6dd2ddf a874e4e8
"""Reproduce: shape constant/symbol mismatch on A."""
import torch
from common import build_matmul_kernel
def main():
    """Trigger a host-side shape check failure on tensor A.

    The kernel is compiled for A of shape (M, K); allocating A with a
    trailing dimension of K + 1 makes the shape check fire.
    """
    M = N = K = 128
    kernel = build_matmul_kernel(M, N, K, target="cuda")
    # A's second dimension is deliberately wrong (K + 1 instead of K).
    bad_a = torch.empty((M, K + 1), device="cuda", dtype=torch.float16)
    good_b = torch.empty((K, N), device="cuda", dtype=torch.float16)
    kernel(bad_a, good_b)


if __name__ == "__main__":
    main()
"""Reproduce: strides check failure (non-contiguous A via transpose)."""
import torch
from common import build_matmul_kernel
def main():
    """Trigger a strides check failure by passing a non-contiguous A."""
    M = N = K = 128
    kernel = build_matmul_kernel(M, N, K, target="cuda")
    base = torch.empty((M, K), device="cuda", dtype=torch.float16)
    # Transposing swaps strides without copying, so the view is
    # non-contiguous and fails the host-side strides check.
    noncontig_a = base.t()
    b = torch.empty((K, N), device="cuda", dtype=torch.float16)
    kernel(noncontig_a, b)


if __name__ == "__main__":
    main()
"""Reproduce: device_type mismatch by passing CPU tensors to a CUDA kernel."""
import torch
from common import build_matmul_kernel
def main():
    """Trigger a device_type mismatch by feeding CPU tensors to a CUDA kernel."""
    M = N = K = 64
    kernel = build_matmul_kernel(M, N, K, target="cuda")
    # Both inputs live on the CPU while the kernel expects CUDA buffers.
    a = torch.empty((M, K), device="cpu", dtype=torch.float16)
    b = torch.empty((K, N), device="cpu", dtype=torch.float16)
    kernel(a, b)


if __name__ == "__main__":
    main()
"""Reproduce: device_id mismatch (requires >=2 CUDA devices)."""
import torch
from common import build_matmul_kernel
def main():
    """Trigger a device_id mismatch by placing inputs on different GPUs.

    Requires at least two CUDA devices; prints a [SKIP] note and returns
    when fewer are available (run_all.py keys off that marker).
    """
    if not torch.cuda.is_available():
        raise RuntimeError("CUDA is not available")
    if torch.cuda.device_count() < 2:
        print("[SKIP] Need at least 2 CUDA devices to reproduce device_id mismatch.")
        return
    M = N = K = 64
    kernel = build_matmul_kernel(M, N, K, target="cuda")
    a = torch.empty((M, K), device="cuda:0", dtype=torch.float16)
    b = torch.empty((K, N), device="cuda:1", dtype=torch.float16)
    # Output device is derived by the adapter; the mismatch surfaces in the
    # host-side device checks.
    kernel(a, b)


if __name__ == "__main__":
    main()
"""Reproduce: NULL data pointer (advanced).
Passing None for a tensor argument will be forwarded through the adapter. Depending on
FFI handling, this commonly triggers a pointer-type assertion (e.g., "Expect buffer <name> to be pointer or tensor")
or a host-side non-NULL pointer check.
Note: Constructing a true DLTensor with NULL data in PyTorch is not typical; this script
demonstrates passing None, which still reproduces the intended class of failure.
"""
import torch
from common import build_matmul_kernel
def main():
    """Forward a None tensor argument to provoke a null-pointer-style check.

    Passing None stands in for a DLTensor with a NULL data pointer; the
    adapter/host stub is expected to reject it with a pointer assertion.
    """
    M = N = K = 64
    kernel = build_matmul_kernel(M, N, K, target="cuda")
    null_a = None  # attempt to pass a null-like pointer
    b = torch.empty((K, N), device="cuda", dtype=torch.float16)
    kernel(null_a, b)


if __name__ == "__main__":
    main()
"""Reproduce: scalar parameter type mismatch (int/bool)."""
from common import build_scalar_check_kernel
def main():
    """Trigger scalar argument type checks (int and bool parameters).

    The first bad call raises, which previously made the second one
    unreachable; catch and report it so both mismatches are demonstrated.
    The uncaught second failure still makes the script exit non-zero.
    """
    fn = build_scalar_check_kernel(target="cuda")
    try:
        fn(1.0, True)  # x should be int -> Expect arg[0] to be int
    except Exception as exc:  # repro script: report the first error, move on
        print(f"[REPRO] int check fired: {exc}")
    fn(1, 2.5)  # flag should be bool -> Expect arg[1] to be boolean


if __name__ == "__main__":
    main()
# Host-Side Check Repro Scripts
This folder contains standalone scripts that deliberately trigger host-side (and adapter-side) validation errors described in `docs/compiler_internals/tensor_checks.md`. Each script can be run directly and will reproduce the corresponding error with a minimal example.
Prerequisites
- CUDA-capable environment (most scripts compile a CUDA-targeted kernel)
- Python packages: torch, tilelang
Usage
- Run any script, e.g.:
- `python 01_num_args_mismatch.py`
- `python 02_pointer_type_error.py`
- ... up to `10_scalar_type_mismatch.py`
- Or run all at once with a summary:
- `python run_all.py`
- Logs per test are saved under `logs/` as `<script>.out` / `<script>.err`.
Notes
- Scripts assume at least one CUDA device. For the device-id mismatch case (08), two GPUs are required; the script will skip with a note if only one is available.
- The adapter raises some errors before the host stub (e.g., wrong input count). The messages are aligned with the host checks as far as possible.
import tilelang
import tilelang.language as T
import torch
# @tilelang.jit(compile_flags=["-O3", "--use_fast_math", "--expt-relaxed-constexpr"])
def matmul(M, N, K, block_M, block_N, block_K, dtype="float16", accum_dtype="float"):
def make_matmul_prim(M, N, K, block_M=128, block_N=128, block_K=32, dtype=T.float16, accum_dtype=T.float32):
@T.prim_func
def main(
A: T.Tensor((M, K), dtype),
B: T.Tensor((K, N), dtype),
C: T.Tensor((M, N), dtype),
):
# Initialize Kernel Context
with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=128) as (bx, by):
A_shared = T.alloc_shared((block_M, block_K), dtype)
B_shared = T.alloc_shared((block_K, block_N), dtype)
C_local = T.alloc_fragment((block_M, block_N), accum_dtype)
T.clear(C_local)
for ko in T.Pipelined(T.ceildiv(K, block_K), num_stages=3):
for ko in T.Pipelined(T.ceildiv(K, block_K), num_stages=0):
T.copy(A[by * block_M, ko * block_K], A_shared)
T.copy(B[ko * block_K, bx * block_N], B_shared)
T.gemm(A_shared, B_shared, C_local)
......@@ -27,30 +24,18 @@ def matmul(M, N, K, block_M, block_N, block_K, dtype="float16", accum_dtype="flo
return main
M = 1024
N = 1024
K = 1024
block_M = 128
block_N = 128
block_K = 32
func = matmul(M, N, K, block_M, block_N, block_K)
jit_kernel = tilelang.compile(
func, out_idx=[2], target="cuda", compile_flags="-O3 --use_fast_math --expt-relaxed-constexpr")
# or jit_kernel = tilelang.compile(func, out_idx=[2], target="cuda", compile_flags=["-O3", "--use_fast_math", "--expt-relaxed-constexpr"])
# or jit_kernel = tilelang.compile(func, out_idx=[2], target="cuda", compile_flags=["-O3 --use_fast_math --expt-relaxed-constexpr"])
def build_matmul_kernel(M=1024, N=1024, K=1024, target="cuda"):
    """Compile and return a callable matmul kernel taking (A, B) -> C.

    ``out_idx=[2]`` marks the third prim-func parameter (C) as the output,
    so the compiled wrapper only takes the two inputs and returns C.

    Raises:
        RuntimeError: if a CUDA target is requested but CUDA is unavailable.
    """
    if target.startswith("cuda") and not torch.cuda.is_available():
        raise RuntimeError("CUDA is not available; cannot build CUDA kernel for host-check repros.")
    prim = make_matmul_prim(M, N, K)
    return tilelang.compile(prim, out_idx=[2], target=target)
import torch
a = torch.randn(M, K, device="cuda", dtype=torch.float16)
b = torch.randn(K, N, device="cuda", dtype=torch.float16)
c = jit_kernel(a, b)
print(c)
ref_c = a @ b
def build_scalar_check_kernel(target="cuda"):
    """Compile a trivial kernel with scalar params to exercise type checks.

    ``scalar_check`` takes an int32 ``x`` and a bool ``flag`` and does
    nothing; it exists only so the host stub validates scalar argument types.
    """

    @T.prim_func
    def scalar_check(x: T.int32, flag: T.bool()):
        T.evaluate(0)

    return tilelang.compile(scalar_check, target=target)
import sys
import subprocess
from pathlib import Path
def main():
root = Path(__file__).resolve().parent
scripts = [
"01_num_args_mismatch.py",
"02_pointer_type_error.py",
"03_ndim_mismatch.py",
"04_dtype_mismatch.py",
"05_shape_mismatch.py",
"06_strides_mismatch.py",
"07_device_type_mismatch.py",
"08_device_id_mismatch.py",
"09_null_data_pointer.py",
"10_scalar_type_mismatch.py",
]
logs_dir = root / "logs"
logs_dir.mkdir(exist_ok=True)
results = []
for name in scripts:
script_path = root / name
if not script_path.exists():
results.append((name, "MISSING", 0))
print(f"[MISSING] {name}")
continue
print(f"\n=== Running {name} ===")
proc = subprocess.run(
[sys.executable, str(script_path)],
cwd=str(root),
capture_output=True,
text=True,
)
# Save logs
(logs_dir / f"{name}.out").write_text(proc.stdout)
(logs_dir / f"{name}.err").write_text(proc.stderr)
out = (proc.stdout or "") + (proc.stderr or "")
if "[SKIP]" in out:
status = "SKIP"
elif proc.returncode != 0:
status = "PASS" # error reproduced as expected
else:
status = "FAIL" # no error observed
results.append((name, status, proc.returncode))
print(f"[{status}] {name} (rc={proc.returncode})")
# Summary
print("\n=== Summary ===")
counts = {"PASS": 0, "FAIL": 0, "SKIP": 0, "MISSING": 0}
for name, status, _ in results:
counts[status] = counts.get(status, 0) + 1
print(f"{status:7} {name}")
print("\nTotals:")
for k in ("PASS", "FAIL", "SKIP", "MISSING"):
print(f" {k:7}: {counts.get(k, 0)}")
# Exit non-zero if any FAIL
sys.exit(1 if counts.get("FAIL", 0) else 0)
if __name__ == "__main__":
main()
......@@ -37,7 +37,7 @@ OP_NAMES: Dict[int, str] = {
6: "sqrt",
7: "tanh",
8: "rsqrt",
9: "inv_sqrt"
9: "inv_sqrt",
}
# Block sizes for kernels
......@@ -49,8 +49,7 @@ TILELANG_THREADS = 128
def parse_arguments() -> argparse.Namespace:
"""Parse command line arguments."""
parser = argparse.ArgumentParser(
description="Precision comparison tool for various CUDA implementations")
parser = argparse.ArgumentParser(description="Precision comparison tool for various CUDA implementations")
parser.add_argument("--n", type=int, default=1000000, help="Number of elements to test")
parser.add_argument("--low", type=float, default=-4.0, help="Lower bound for random values")
parser.add_argument("--high", type=float, default=4.0, help="Upper bound for random values")
......@@ -67,7 +66,7 @@ def initialize_cuda() -> torch.nn.Module:
return load(
name="cuda_ops",
sources=["cuda_ops.cu"],
extra_cuda_cflags=[] # No fast_math flags
extra_cuda_cflags=[], # No fast_math flags
)
......@@ -149,8 +148,7 @@ def triton_unary_kernel(x_ptr, out_ptr, n_elements, op_id: tl.constexpr, BLOCK_S
@triton.jit
def triton_libdevice_unary_kernel(x_ptr, out_ptr, n_elements, op_id: tl.constexpr,
BLOCK_SIZE: tl.constexpr):
def triton_libdevice_unary_kernel(x_ptr, out_ptr, n_elements, op_id: tl.constexpr, BLOCK_SIZE: tl.constexpr):
"""LibDevice Triton kernel for unary operations."""
pid = tl.program_id(0)
block_start = pid * BLOCK_SIZE
......@@ -188,13 +186,10 @@ def make_tilelang_unary_kernel(M: int, N: int, op_id: int, use_fastmath: bool =
@T.prim_func
def tilelang_unary_kernel(
A: T.Tensor((M, N), "float32"),
B: T.Tensor((M, N), "float32"),
A: T.Tensor((M, N), T.float32),
B: T.Tensor((M, N), T.float32),
):
with T.Kernel(
T.ceildiv(N, TILELANG_BLOCK_N),
T.ceildiv(M, TILELANG_BLOCK_M),
threads=TILELANG_THREADS) as (bx, by):
with T.Kernel(T.ceildiv(N, TILELANG_BLOCK_N), T.ceildiv(M, TILELANG_BLOCK_M), threads=TILELANG_THREADS) as (bx, by):
for i, j in T.Parallel(TILELANG_BLOCK_M, TILELANG_BLOCK_N):
row = by * TILELANG_BLOCK_M + i
col = bx * TILELANG_BLOCK_N + j
......@@ -229,14 +224,11 @@ def make_tilelang_binary_kernel(M: int, N: int):
@T.prim_func
def tilelang_binary_kernel(
A: T.Tensor((M, N), "float32"),
B: T.Tensor((M, N), "float32"),
C: T.Tensor((M, N), "float32"),
A: T.Tensor((M, N), T.float32),
B: T.Tensor((M, N), T.float32),
C: T.Tensor((M, N), T.float32),
):
with T.Kernel(
T.ceildiv(N, TILELANG_BLOCK_N),
T.ceildiv(M, TILELANG_BLOCK_M),
threads=TILELANG_THREADS) as (bx, by):
with T.Kernel(T.ceildiv(N, TILELANG_BLOCK_N), T.ceildiv(M, TILELANG_BLOCK_M), threads=TILELANG_THREADS) as (bx, by):
for i, j in T.Parallel(TILELANG_BLOCK_M, TILELANG_BLOCK_N):
row = by * TILELANG_BLOCK_M + i
col = bx * TILELANG_BLOCK_N + j
......@@ -247,10 +239,7 @@ def make_tilelang_binary_kernel(M: int, N: int):
return tilelang_binary_kernel
def tilelang_op(x: torch.Tensor,
op_id: int,
y: Optional[torch.Tensor] = None,
use_fastmath: bool = False) -> torch.Tensor:
def tilelang_op(x: torch.Tensor, op_id: int, y: Optional[torch.Tensor] = None, use_fastmath: bool = False) -> torch.Tensor:
"""TileLang operation interface."""
assert x.is_cuda
......@@ -272,7 +261,8 @@ def tilelang_op(x: torch.Tensor,
target="cuda",
pass_configs={
tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: use_fastmath,
})
},
)
out = kernel(x, y)
else: # Unary operation
kernel_func = make_tilelang_unary_kernel(M, N, op_id, use_fastmath)
......@@ -282,7 +272,8 @@ def tilelang_op(x: torch.Tensor,
target="cuda",
pass_configs={
tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: use_fastmath,
})
},
)
out = kernel(x)
# Restore original shape
......@@ -293,7 +284,7 @@ def triton_op(x: torch.Tensor, op_id: int, y: Optional[torch.Tensor] = None) ->
"""Standard Triton operation interface."""
assert x.is_cuda
out = torch.empty_like(x)
grid = lambda meta: ((x.numel() + meta['BLOCK_SIZE'] - 1) // meta['BLOCK_SIZE'],)
grid = lambda meta: ((x.numel() + meta["BLOCK_SIZE"] - 1) // meta["BLOCK_SIZE"],)
if op_id == 0: # Division - binary operation
assert y is not None, "Division operation requires second operand"
......@@ -304,13 +295,11 @@ def triton_op(x: torch.Tensor, op_id: int, y: Optional[torch.Tensor] = None) ->
return out
def triton_libdevice_op(x: torch.Tensor,
op_id: int,
y: Optional[torch.Tensor] = None) -> torch.Tensor:
def triton_libdevice_op(x: torch.Tensor, op_id: int, y: Optional[torch.Tensor] = None) -> torch.Tensor:
"""LibDevice Triton operation interface."""
assert x.is_cuda
out = torch.empty_like(x)
grid = lambda meta: ((x.numel() + meta['BLOCK_SIZE'] - 1) // meta['BLOCK_SIZE'],)
grid = lambda meta: ((x.numel() + meta["BLOCK_SIZE"] - 1) // meta["BLOCK_SIZE"],)
if op_id == 0: # Division - binary operation
assert y is not None, "Division operation requires second operand"
......@@ -321,9 +310,7 @@ def triton_libdevice_op(x: torch.Tensor,
return out
def get_pytorch_reference(x: torch.Tensor,
op_id: int,
y: Optional[torch.Tensor] = None) -> torch.Tensor:
def get_pytorch_reference(x: torch.Tensor, op_id: int, y: Optional[torch.Tensor] = None) -> torch.Tensor:
"""Get PyTorch reference implementation for the given operation."""
if op_id == 0:
assert y is not None, "Division requires second operand"
......@@ -362,8 +349,10 @@ def summarize_error(tag: str, output: Optional[torch.Tensor], reference: torch.T
abs_err = (output_double - reference_double).abs()
rel_err = abs_err / (reference_double.abs().clamp_min(1e-30))
print(f"{tag:<32} max abs: {abs_err.max():.3e}, mean abs: {abs_err.mean():.3e}, "
f"max rel: {rel_err.max():.3e}, mean rel: {rel_err.mean():.3e}")
print(
f"{tag:<32} max abs: {abs_err.max():.3e}, mean abs: {abs_err.mean():.3e}, "
f"max rel: {rel_err.max():.3e}, mean rel: {rel_err.mean():.3e}"
)
# Precision comparison function
......@@ -407,9 +396,7 @@ def compare(op_id: int, x: torch.Tensor, y: Optional[torch.Tensor] = None) -> No
results[name] = None
# Print comparison header
print(
f"{'Implementation':<32} {'Max Abs Error':<19} {'Mean Abs Error':<20} {'Max Rel Error':<19} {'Mean Rel Error'}"
)
print(f"{'Implementation':<32} {'Max Abs Error':<19} {'Mean Abs Error':<20} {'Max Rel Error':<19} {'Mean Rel Error'}")
print("-" * 90)
# Compare all implementations against double precision reference
......@@ -427,8 +414,7 @@ def compare(op_id: int, x: torch.Tensor, y: Optional[torch.Tensor] = None) -> No
summarize_error(tag, output, ref_double)
def generate_test_data(op_id: int, n: int, device: torch.device, low: float,
high: float) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
def generate_test_data(op_id: int, n: int, device: torch.device, low: float, high: float) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
"""Generate appropriate test data for each operation."""
if op_id == 0: # Division
x = torch.empty(n, device=device).uniform_(low, high)
......@@ -450,9 +436,7 @@ def generate_test_data(op_id: int, n: int, device: torch.device, low: float,
def main() -> None:
"""Main execution function."""
print(
"Precision comparison between CUDA Precise/Fast, Triton, Triton LibDevice, PyTorch, and TileLang"
)
print("Precision comparison between CUDA Precise/Fast, Triton, Triton LibDevice, PyTorch, and TileLang")
print("=" * 90)
for op_id in range(len(OP_NAMES)):
......
......@@ -10,39 +10,32 @@ env["TILELANG_CLEAR_CACHE"] = "1"
def parse_output(output):
    """Extract latency/TFlops/config fields from a performance run's stdout.

    Returns a dict with keys ``latency``, ``best_tflops``, ``config`` and
    ``ref_tflops`` for each matching line found; a value falls back to
    ``"N/A"`` when the line is present but the number cannot be parsed.
    """
    data = {}
    for line in output.split("\n"):
        line = line.strip()
        if line.startswith("Latency:"):
            match = re.search(r"Latency: ([\d.]+)", line)
            data["latency"] = match.group(1) if match else "N/A"
        elif line.startswith("TFlops:"):
            match = re.search(r"TFlops: ([\d.]+)", line)
            data["best_tflops"] = match.group(1) if match else "N/A"
        elif line.startswith("Config:"):
            data["config"] = line.split("Config: ")[-1]
        elif line.startswith("Reference TFlops:"):
            match = re.search(r"Reference TFlops: ([\d.]+)", line)
            data["ref_tflops"] = match.group(1) if match else "N/A"
    return data
output_v1 = subprocess.run(['./tl/bin/python', './maint/scripts/performance.py'],
capture_output=True,
text=True,
env=env).stdout
output_v1 = subprocess.run(["./tl/bin/python", "./maint/scripts/performance.py"], capture_output=True, text=True, env=env).stdout
data_v1 = parse_output(output_v1)
output_v2 = subprocess.run(['./tll/bin/python', './maint/scripts/performance.py'],
capture_output=True,
text=True,
env=env).stdout
output_v2 = subprocess.run(["./tll/bin/python", "./maint/scripts/performance.py"], capture_output=True, text=True, env=env).stdout
data_v2 = parse_output(output_v2)
table = [[
"original", data_v1['latency'], data_v1['best_tflops'], data_v1['ref_tflops'], data_v1['config']
], [
"current", data_v2['latency'], data_v2['best_tflops'], data_v2['ref_tflops'], data_v2['config']
]]
table = [
["original", data_v1["latency"], data_v1["best_tflops"], data_v1["ref_tflops"], data_v1["config"]],
["current", data_v2["latency"], data_v2["best_tflops"], data_v2["ref_tflops"], data_v2["config"]],
]
headers = ["version", "Best Latency (s)", "Best TFlops", "Reference TFlops", "Best Config"]
......
......@@ -2,4 +2,4 @@
set -euxo pipefail
# Build for local architecture
CIBW_BUILD='cp38-*' cibuildwheel .
CIBW_BUILD='cp39-*' cibuildwheel . 2>&1 | tee cibuildwheel.log
......@@ -12,9 +12,8 @@ if docker buildx version >/dev/null 2>&1; then
docker buildx use multi >/dev/null 2>&1 || true
fi
docker buildx inspect --bootstrap >/dev/null 2>&1 || true
done
export CIBW_ARCHS='x86_64 aarch64'
fi
NO_VERSION_LABEL=ON CIBW_BUILD='cp38-*' cibuildwheel .
NO_VERSION_LABEL=ON CIBW_BUILD='cp39-*' cibuildwheel . 2>&1 | tee cibuildwheel.log
......@@ -8,19 +8,20 @@ def ref_program(A, B):
def get_configs():
    """Return the autotuner candidate configurations (a single config here)."""
    configs = [
        {
            "block_M": 128,
            "block_N": 128,
            "block_K": 64,
            "num_stages": 2,
            "thread_num": 256,
            "enable_rasteration": True,  # keep param name for backward-compat
        }
    ]
    return configs
def run(M, N, K):
def kernel(
block_M=None,
block_N=None,
......@@ -29,8 +30,8 @@ def run(M, N, K):
thread_num=None,
enable_rasteration=None,
):
dtype = "float16"
accum_dtype = "float"
dtype = T.float16
accum_dtype = T.float32
@T.prim_func
def main(
......@@ -38,8 +39,7 @@ def run(M, N, K):
B: T.Tensor((N, K), dtype),
C: T.Tensor((M, N), dtype),
):
with T.Kernel(
T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=thread_num) as (bx, by):
with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=thread_num) as (bx, by):
A_shared = T.alloc_shared((block_M, block_K), dtype)
B_shared = T.alloc_shared((block_N, block_K), dtype)
C_local = T.alloc_fragment((block_M, block_N), accum_dtype)
......@@ -60,12 +60,16 @@ def run(M, N, K):
return main
autotuner = AutoTuner.from_kernel(
kernel=kernel, configs=get_configs()).set_compile_args(
autotuner = (
AutoTuner.from_kernel(kernel=kernel, configs=get_configs())
.set_compile_args(
out_idx=[-1],
target="auto",
).set_profile_args(
ref_prog=ref_program,)
)
.set_profile_args(
ref_prog=ref_program,
)
)
return autotuner.run(warmup=3, rep=20)
......
FROM quay.io/pypa/manylinux2014_x86_64 AS builder_amd64
FROM quay.io/pypa/manylinux_2_28_x86_64 AS builder_amd64
RUN yum-config-manager --add-repo https://developer.download.nvidia.cn/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo
RUN dnf config-manager --add-repo https://developer.download.nvidia.cn/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo
ARG CUDA_VERSION=12.1
ARG CUDA_VERSION=12.8
ENV CUDA_VERSION=${CUDA_VERSION}
FROM quay.io/pypa/manylinux_2_28_aarch64 AS builder_arm64
......
......@@ -14,7 +14,13 @@ cd examples
python -m pytest -n 4 . --verbose --color=yes --durations=0 --showlocals --cache-clear
cd ..
# Run pytest in parallel (4 workers) for all tests in the testing/python directory
# Run pytest in parallel (4 workers) for all tests in the testing/python directory.
# IMPORTANT: CuTeDSL backend currently requires GEMM v1 (TILELANG_USE_GEMM_V1=1).
# Do NOT export it globally here, or you'll silently change the default GEMM selection
# for unrelated tests. Run the CuTeDSL JIT tests in a separate pytest invocation.
cd testing/python
python -m pytest -n 4 . --verbose --color=yes --durations=0 --showlocals --cache-clear
python -m pytest -n 4 . --ignore=jit/test_tilelang_jit_cutedsl.py --verbose --color=yes --durations=0 --showlocals --cache-clear
# CuTeDSL JIT tests (isolate env + avoid xdist contention on a single GPU)
TILELANG_USE_GEMM_V1=1 python -m pytest -n 1 jit/test_tilelang_jit_cutedsl.py --verbose --color=yes --durations=0 --showlocals --cache-clear
cd ..
......@@ -27,7 +27,14 @@ classifiers = [
]
dynamic = ["version"]
dependencies = [
"apache-tvm-ffi==0.1.0",
"apache-tvm-ffi~=0.1.0",
# Extra constraint to tvm-ffi for abi issue,
# should be removed after our tvm's update.
# See discussion in tilelang#1373 and apache/tvm-ffi#307
"apache-tvm-ffi>=0.1.3",
# torch-c-dlpack-ext provides prebuilt torch extensions.
# Without it, TVM FFI may require JIT compilation on first import.
"torch-c-dlpack-ext",
"cloudpickle",
"ml-dtypes",
"numpy>=1.23.5",
......@@ -36,15 +43,25 @@ dependencies = [
"torch>=2.7; platform_system == 'Darwin'",
"tqdm>=4.62.3",
"typing-extensions>=4.10.0",
"z3-solver>=4.13.0",
]
[project.optional-dependencies]
# ml-dtypes must be >= 0.5.1
# if you want to enable fp4
fp4 = ["ml-dtypes>=0.5.1"]
# if you want to enable layout inference visualization
vis = ["matplotlib"]
[build-system]
requires = ["cython>=3.0.0", "scikit-build-core"]
requires = [
"cython>=3.0.0",
"scikit-build-core",
"z3-solver>=4.13.0",
# Not for auditwheel, explicitly add patchelf for repairing libz3.so.
# See tvm's CMakeLists.txt for more information.
"patchelf>=0.17.2; platform_system == 'Linux'",
]
build-backend = "scikit_build_core.build"
[tool.scikit-build]
......@@ -104,6 +121,7 @@ tilelang = "tilelang"
# TVM
"tilelang/3rdparty/tvm/src" = "3rdparty/tvm/src"
"tilelang/3rdparty/tvm/python" = "3rdparty/tvm/python"
"tilelang/3rdparty/tvm/include" = "3rdparty/tvm/include"
"tilelang/3rdparty/tvm/version.py" = "3rdparty/tvm/version.py"
# CUTLASS
"tilelang/3rdparty/cutlass/include" = "3rdparty/cutlass/include"
......@@ -112,10 +130,7 @@ tilelang = "tilelang"
"tilelang/3rdparty/composable_kernel/include" = "3rdparty/composable_kernel/include"
"tilelang/3rdparty/composable_kernel/library" = "3rdparty/composable_kernel/library"
[tool.yapf]
based_on_style = "yapf"
column_limit = 100
indent_width = 4
[tool.codespell]
ignore-words = "docs/spelling_wordlist.txt"
......@@ -128,7 +143,7 @@ skip = [
[tool.ruff]
target-version = "py39"
line-length = 100
line-length = 140
output-format = "full"
exclude = [
......@@ -136,6 +151,14 @@ exclude = [
"examples/deepseek_v32/inference",
]
[tool.ruff.format]
quote-style = "double"
indent-style = "space"
skip-magic-trailing-comma = false
line-ending = "auto"
docstring-code-format = false
docstring-code-line-length = "dynamic"
[tool.ruff.lint.per-file-ignores]
# Do not upgrade type hint in testing and examples.
# See https://github.com/tile-ai/tilelang/issues/1079 for more information.
......@@ -211,12 +234,10 @@ environment.PYTHONDEVMODE = "1"
environment.PYTHONUNBUFFERED = "1"
environment.PATH = "/usr/local/cuda/bin:$PATH"
environment.LD_LIBRARY_PATH = "/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH"
# Pin to glibc 2.17 for x86 and 2.28 for aarch64 for now
# TODO: upgrade to manylinux_2_28 at some time
manylinux-x86_64-image = "manylinux2014" # CentOS 7
manylinux-aarch64-image = "manylinux_2_28" # AlmaLinux 8
manylinux-x86_64-image = "manylinux_2_28" # AlmaLinux 8
manylinux-aarch64-image = "manylinux_2_34" # Z3 requires
# Install CUDA runtime and stub driver library
# manylinux_2_28 uses gcc 14, which needs CUDA 12.8
# manylinux_2_28 uses gcc 14, which needs CUDA >=12.8
before-all = """
set -eux
......@@ -225,8 +246,8 @@ uname -a
case "$(uname -m)" in
"x86_64")
DEFAULT_CUDA_VERSION="12.1"
yum-config-manager --add-repo https://developer.download.nvidia.cn/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo
DEFAULT_CUDA_VERSION="12.8"
dnf config-manager --add-repo https://developer.download.nvidia.cn/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo
;;
"aarch64")
DEFAULT_CUDA_VERSION="12.8"
......@@ -240,9 +261,10 @@ esac
cudaver="$(echo "${CUDA_VERSION:-$DEFAULT_CUDA_VERSION}" | cut -d '.' -f-2)"
v="${cudaver//./-}"
yum install -y "cuda-minimal-build-${v}" "cuda-driver-devel-${v}" "cuda-nvrtc-devel-${v}" nvidia-driver-cuda-libs
yum clean all
"""
repair-wheel-command = [
"auditwheel -v repair --exclude libtvm_ffi.so --exclude libcuda.so.1 --exclude '/usr/local/cuda*' -w {dest_dir} {wheel}",
"auditwheel -v repair --exclude libtvm_ffi.so --exclude libz3.so --exclude libcuda.so.1 --exclude '/usr/local/cuda*' -w {dest_dir} {wheel}",
"pipx run abi3audit --verbose --strict {wheel}",
]
......@@ -254,7 +276,8 @@ repair-wheel-command = [
[[tool.cibuildwheel.overrides]]
select = "*linux*x86_64*"
# CentOS 7 is too old to run import test. Do wheel installation test only.
test-command = [
"echo 'Wheel is installed successfully'",
# x86_64 runners in GitHub Actions have limited storage,
# pre-install torch without caching to reduce disk usage during install tilelang.
before-test = [
"pip install torch --no-cache-dir",
]
# Requirements to run local build with `--no-build-isolation` or other developments
apache-tvm-ffi~=0.1.0
apache-tvm-ffi>=0.1.3
build
cmake>=3.26
cython>=3.0.0
......@@ -10,6 +10,7 @@ scikit-build-core
setuptools>=61
torch
wheel
z3-solver>=4.13.0
auditwheel; platform_system == 'Linux'
patchelf; platform_system == 'Linux'
......
# Format and lint requirements
pre-commit
clang-format==21.1.2
clang-tidy==21.1.1
clang-format==21.1.7
clang-tidy==21.1.6
codespell[toml]==2.4.1
ruff==0.14.3
yapf==0.43.0
ruff==0.14.9
......@@ -6,3 +6,6 @@
# CUDA specific requirements
flash-attn==2.5.8
cuda-python==12.9.4
# CuTeDSL (CUTLASS Python DSL with CuTe support)
nvidia-cutlass-dsl>=4.3.1
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment