Unverified Commit 667632cc authored by guchaoyang's avatar guchaoyang Committed by GitHub
Browse files

Merge branch 'main' into dcu

parents d6dd2ddf a874e4e8
"""Reproduce: shape constant/symbol mismatch on A."""
import torch
from common import build_matmul_kernel
def main():
    """Trigger a host-side shape check failure on tensor A.

    The kernel is compiled for A of shape (M, K); allocating A with a
    trailing dimension of K + 1 makes the shape check fire.
    """
    M = N = K = 128
    kernel = build_matmul_kernel(M, N, K, target="cuda")
    # A's second dimension is deliberately wrong (K + 1 instead of K).
    bad_a = torch.empty((M, K + 1), device="cuda", dtype=torch.float16)
    good_b = torch.empty((K, N), device="cuda", dtype=torch.float16)
    kernel(bad_a, good_b)


if __name__ == "__main__":
    main()
"""Reproduce: strides check failure (non-contiguous A via transpose)."""
import torch
from common import build_matmul_kernel
def main():
    """Trigger a strides check failure by passing a non-contiguous A."""
    M = N = K = 128
    kernel = build_matmul_kernel(M, N, K, target="cuda")
    base = torch.empty((M, K), device="cuda", dtype=torch.float16)
    # Transposing swaps strides without copying, so the view is
    # non-contiguous and fails the host-side strides check.
    noncontig_a = base.t()
    b = torch.empty((K, N), device="cuda", dtype=torch.float16)
    kernel(noncontig_a, b)


if __name__ == "__main__":
    main()
"""Reproduce: device_type mismatch by passing CPU tensors to a CUDA kernel."""
import torch
from common import build_matmul_kernel
def main():
    """Trigger a device_type mismatch by feeding CPU tensors to a CUDA kernel."""
    M = N = K = 64
    kernel = build_matmul_kernel(M, N, K, target="cuda")
    # Both inputs live on the CPU while the kernel expects CUDA buffers.
    a = torch.empty((M, K), device="cpu", dtype=torch.float16)
    b = torch.empty((K, N), device="cpu", dtype=torch.float16)
    kernel(a, b)


if __name__ == "__main__":
    main()
"""Reproduce: device_id mismatch (requires >=2 CUDA devices)."""
import torch
from common import build_matmul_kernel
def main():
    """Trigger a device_id mismatch by placing inputs on different GPUs.

    Requires at least two CUDA devices; prints a [SKIP] note and returns
    when fewer are available (run_all.py keys off that marker).
    """
    if not torch.cuda.is_available():
        raise RuntimeError("CUDA is not available")
    if torch.cuda.device_count() < 2:
        print("[SKIP] Need at least 2 CUDA devices to reproduce device_id mismatch.")
        return
    M = N = K = 64
    kernel = build_matmul_kernel(M, N, K, target="cuda")
    a = torch.empty((M, K), device="cuda:0", dtype=torch.float16)
    b = torch.empty((K, N), device="cuda:1", dtype=torch.float16)
    # Output device is derived by the adapter; the mismatch surfaces in the
    # host-side device checks.
    kernel(a, b)


if __name__ == "__main__":
    main()
"""Reproduce: NULL data pointer (advanced).
Passing None for a tensor argument will be forwarded through the adapter. Depending on
FFI handling, this commonly triggers a pointer-type assertion (e.g., "Expect buffer <name> to be pointer or tensor")
or a host-side non-NULL pointer check.
Note: Constructing a true DLTensor with NULL data in PyTorch is not typical; this script
demonstrates passing None, which still reproduces the intended class of failure.
"""
import torch
from common import build_matmul_kernel
def main():
    """Forward a None tensor argument to provoke a null-pointer-style check.

    Passing None stands in for a DLTensor with a NULL data pointer; the
    adapter/host stub is expected to reject it with a pointer assertion.
    """
    M = N = K = 64
    kernel = build_matmul_kernel(M, N, K, target="cuda")
    null_a = None  # attempt to pass a null-like pointer
    b = torch.empty((K, N), device="cuda", dtype=torch.float16)
    kernel(null_a, b)


if __name__ == "__main__":
    main()
"""Reproduce: scalar parameter type mismatch (int/bool)."""
from common import build_scalar_check_kernel
def main():
    """Trigger scalar argument type checks (int and bool parameters).

    The first bad call raises, which previously made the second one
    unreachable; catch and report it so both mismatches are demonstrated.
    The uncaught second failure still makes the script exit non-zero.
    """
    fn = build_scalar_check_kernel(target="cuda")
    try:
        fn(1.0, True)  # x should be int -> Expect arg[0] to be int
    except Exception as exc:  # repro script: report the first error, move on
        print(f"[REPRO] int check fired: {exc}")
    fn(1, 2.5)  # flag should be bool -> Expect arg[1] to be boolean


if __name__ == "__main__":
    main()
# Host-Side Check Repro Scripts
This folder contains standalone scripts that deliberately trigger host-side (and adapter-side) validation errors described in `docs/compiler_internals/tensor_checks.md`. Each script can be run directly and will reproduce the corresponding error with a minimal example.
Prerequisites
- CUDA-capable environment (most scripts compile a CUDA-targeted kernel)
- Python packages: torch, tilelang
Usage
- Run any script, e.g.:
- `python 01_num_args_mismatch.py`
- `python 02_pointer_type_error.py`
- ... up to `10_scalar_type_mismatch.py`
- Or run all at once with a summary:
- `python run_all.py`
- Logs per test are saved under `logs/` as `<script>.out` / `<script>.err`.
Notes
- Scripts assume at least one CUDA device. For the device-id mismatch case (08), two GPUs are required; the script will skip with a note if only one is available.
- The adapter raises some errors before the host stub (e.g., wrong input count). The messages are aligned with the host checks as far as possible.
import tilelang
import tilelang.language as T
import torch
# @tilelang.jit(compile_flags=["-O3", "--use_fast_math", "--expt-relaxed-constexpr"])
def matmul(M, N, K, block_M, block_N, block_K, dtype="float16", accum_dtype="float"):
def make_matmul_prim(M, N, K, block_M=128, block_N=128, block_K=32, dtype=T.float16, accum_dtype=T.float32):
@T.prim_func
def main(
A: T.Tensor((M, K), dtype),
B: T.Tensor((K, N), dtype),
C: T.Tensor((M, N), dtype),
):
# Initialize Kernel Context
with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=128) as (bx, by):
A_shared = T.alloc_shared((block_M, block_K), dtype)
B_shared = T.alloc_shared((block_K, block_N), dtype)
C_local = T.alloc_fragment((block_M, block_N), accum_dtype)
T.clear(C_local)
for ko in T.Pipelined(T.ceildiv(K, block_K), num_stages=3):
for ko in T.Pipelined(T.ceildiv(K, block_K), num_stages=0):
T.copy(A[by * block_M, ko * block_K], A_shared)
T.copy(B[ko * block_K, bx * block_N], B_shared)
T.gemm(A_shared, B_shared, C_local)
......@@ -27,30 +24,18 @@ def matmul(M, N, K, block_M, block_N, block_K, dtype="float16", accum_dtype="flo
return main
M = 1024
N = 1024
K = 1024
block_M = 128
block_N = 128
block_K = 32
func = matmul(M, N, K, block_M, block_N, block_K)
jit_kernel = tilelang.compile(
func, out_idx=[2], target="cuda", compile_flags="-O3 --use_fast_math --expt-relaxed-constexpr")
# or jit_kernel = tilelang.compile(func, out_idx=[2], target="cuda", compile_flags=["-O3", "--use_fast_math", "--expt-relaxed-constexpr"])
# or jit_kernel = tilelang.compile(func, out_idx=[2], target="cuda", compile_flags=["-O3 --use_fast_math --expt-relaxed-constexpr"])
def build_matmul_kernel(M=1024, N=1024, K=1024, target="cuda"):
    """Compile and return a callable matmul kernel taking (A, B) -> C.

    ``out_idx=[2]`` marks the third prim-func parameter (C) as the output,
    so the compiled wrapper only takes the two inputs and returns C.

    Raises:
        RuntimeError: if a CUDA target is requested but CUDA is unavailable.
    """
    if target.startswith("cuda") and not torch.cuda.is_available():
        raise RuntimeError("CUDA is not available; cannot build CUDA kernel for host-check repros.")
    prim = make_matmul_prim(M, N, K)
    return tilelang.compile(prim, out_idx=[2], target=target)
import torch
a = torch.randn(M, K, device="cuda", dtype=torch.float16)
b = torch.randn(K, N, device="cuda", dtype=torch.float16)
c = jit_kernel(a, b)
print(c)
ref_c = a @ b
def build_scalar_check_kernel(target="cuda"):
    """Compile a trivial kernel with scalar params to exercise type checks.

    ``scalar_check`` takes an int32 ``x`` and a bool ``flag`` and does
    nothing; it exists only so the host stub validates scalar argument types.
    """

    @T.prim_func
    def scalar_check(x: T.int32, flag: T.bool()):
        T.evaluate(0)

    return tilelang.compile(scalar_check, target=target)
import sys
import subprocess
from pathlib import Path
def main():
root = Path(__file__).resolve().parent
scripts = [
"01_num_args_mismatch.py",
"02_pointer_type_error.py",
"03_ndim_mismatch.py",
"04_dtype_mismatch.py",
"05_shape_mismatch.py",
"06_strides_mismatch.py",
"07_device_type_mismatch.py",
"08_device_id_mismatch.py",
"09_null_data_pointer.py",
"10_scalar_type_mismatch.py",
]
logs_dir = root / "logs"
logs_dir.mkdir(exist_ok=True)
results = []
for name in scripts:
script_path = root / name
if not script_path.exists():
results.append((name, "MISSING", 0))
print(f"[MISSING] {name}")
continue
print(f"\n=== Running {name} ===")
proc = subprocess.run(
[sys.executable, str(script_path)],
cwd=str(root),
capture_output=True,
text=True,
)
# Save logs
(logs_dir / f"{name}.out").write_text(proc.stdout)
(logs_dir / f"{name}.err").write_text(proc.stderr)
out = (proc.stdout or "") + (proc.stderr or "")
if "[SKIP]" in out:
status = "SKIP"
elif proc.returncode != 0:
status = "PASS" # error reproduced as expected
else:
status = "FAIL" # no error observed
results.append((name, status, proc.returncode))
print(f"[{status}] {name} (rc={proc.returncode})")
# Summary
print("\n=== Summary ===")
counts = {"PASS": 0, "FAIL": 0, "SKIP": 0, "MISSING": 0}
for name, status, _ in results:
counts[status] = counts.get(status, 0) + 1
print(f"{status:7} {name}")
print("\nTotals:")
for k in ("PASS", "FAIL", "SKIP", "MISSING"):
print(f" {k:7}: {counts.get(k, 0)}")
# Exit non-zero if any FAIL
sys.exit(1 if counts.get("FAIL", 0) else 0)
if __name__ == "__main__":
main()
......@@ -37,7 +37,7 @@ OP_NAMES: Dict[int, str] = {
6: "sqrt",
7: "tanh",
8: "rsqrt",
9: "inv_sqrt"
9: "inv_sqrt",
}
# Block sizes for kernels
......@@ -49,8 +49,7 @@ TILELANG_THREADS = 128
def parse_arguments() -> argparse.Namespace:
"""Parse command line arguments."""
parser = argparse.ArgumentParser(
description="Precision comparison tool for various CUDA implementations")
parser = argparse.ArgumentParser(description="Precision comparison tool for various CUDA implementations")
parser.add_argument("--n", type=int, default=1000000, help="Number of elements to test")
parser.add_argument("--low", type=float, default=-4.0, help="Lower bound for random values")
parser.add_argument("--high", type=float, default=4.0, help="Upper bound for random values")
......@@ -67,7 +66,7 @@ def initialize_cuda() -> torch.nn.Module:
return load(
name="cuda_ops",
sources=["cuda_ops.cu"],
extra_cuda_cflags=[] # No fast_math flags
extra_cuda_cflags=[], # No fast_math flags
)
......@@ -149,8 +148,7 @@ def triton_unary_kernel(x_ptr, out_ptr, n_elements, op_id: tl.constexpr, BLOCK_S
@triton.jit
def triton_libdevice_unary_kernel(x_ptr, out_ptr, n_elements, op_id: tl.constexpr,
BLOCK_SIZE: tl.constexpr):
def triton_libdevice_unary_kernel(x_ptr, out_ptr, n_elements, op_id: tl.constexpr, BLOCK_SIZE: tl.constexpr):
"""LibDevice Triton kernel for unary operations."""
pid = tl.program_id(0)
block_start = pid * BLOCK_SIZE
......@@ -188,13 +186,10 @@ def make_tilelang_unary_kernel(M: int, N: int, op_id: int, use_fastmath: bool =
@T.prim_func
def tilelang_unary_kernel(
A: T.Tensor((M, N), "float32"),
B: T.Tensor((M, N), "float32"),
A: T.Tensor((M, N), T.float32),
B: T.Tensor((M, N), T.float32),
):
with T.Kernel(
T.ceildiv(N, TILELANG_BLOCK_N),
T.ceildiv(M, TILELANG_BLOCK_M),
threads=TILELANG_THREADS) as (bx, by):
with T.Kernel(T.ceildiv(N, TILELANG_BLOCK_N), T.ceildiv(M, TILELANG_BLOCK_M), threads=TILELANG_THREADS) as (bx, by):
for i, j in T.Parallel(TILELANG_BLOCK_M, TILELANG_BLOCK_N):
row = by * TILELANG_BLOCK_M + i
col = bx * TILELANG_BLOCK_N + j
......@@ -229,14 +224,11 @@ def make_tilelang_binary_kernel(M: int, N: int):
@T.prim_func
def tilelang_binary_kernel(
A: T.Tensor((M, N), "float32"),
B: T.Tensor((M, N), "float32"),
C: T.Tensor((M, N), "float32"),
A: T.Tensor((M, N), T.float32),
B: T.Tensor((M, N), T.float32),
C: T.Tensor((M, N), T.float32),
):
with T.Kernel(
T.ceildiv(N, TILELANG_BLOCK_N),
T.ceildiv(M, TILELANG_BLOCK_M),
threads=TILELANG_THREADS) as (bx, by):
with T.Kernel(T.ceildiv(N, TILELANG_BLOCK_N), T.ceildiv(M, TILELANG_BLOCK_M), threads=TILELANG_THREADS) as (bx, by):
for i, j in T.Parallel(TILELANG_BLOCK_M, TILELANG_BLOCK_N):
row = by * TILELANG_BLOCK_M + i
col = bx * TILELANG_BLOCK_N + j
......@@ -247,10 +239,7 @@ def make_tilelang_binary_kernel(M: int, N: int):
return tilelang_binary_kernel
def tilelang_op(x: torch.Tensor,
op_id: int,
y: Optional[torch.Tensor] = None,
use_fastmath: bool = False) -> torch.Tensor:
def tilelang_op(x: torch.Tensor, op_id: int, y: Optional[torch.Tensor] = None, use_fastmath: bool = False) -> torch.Tensor:
"""TileLang operation interface."""
assert x.is_cuda
......@@ -272,7 +261,8 @@ def tilelang_op(x: torch.Tensor,
target="cuda",
pass_configs={
tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: use_fastmath,
})
},
)
out = kernel(x, y)
else: # Unary operation
kernel_func = make_tilelang_unary_kernel(M, N, op_id, use_fastmath)
......@@ -282,7 +272,8 @@ def tilelang_op(x: torch.Tensor,
target="cuda",
pass_configs={
tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: use_fastmath,
})
},
)
out = kernel(x)
# Restore original shape
......@@ -293,7 +284,7 @@ def triton_op(x: torch.Tensor, op_id: int, y: Optional[torch.Tensor] = None) ->
"""Standard Triton operation interface."""
assert x.is_cuda
out = torch.empty_like(x)
grid = lambda meta: ((x.numel() + meta['BLOCK_SIZE'] - 1) // meta['BLOCK_SIZE'],)
grid = lambda meta: ((x.numel() + meta["BLOCK_SIZE"] - 1) // meta["BLOCK_SIZE"],)
if op_id == 0: # Division - binary operation
assert y is not None, "Division operation requires second operand"
......@@ -304,13 +295,11 @@ def triton_op(x: torch.Tensor, op_id: int, y: Optional[torch.Tensor] = None) ->
return out
def triton_libdevice_op(x: torch.Tensor,
op_id: int,
y: Optional[torch.Tensor] = None) -> torch.Tensor:
def triton_libdevice_op(x: torch.Tensor, op_id: int, y: Optional[torch.Tensor] = None) -> torch.Tensor:
"""LibDevice Triton operation interface."""
assert x.is_cuda
out = torch.empty_like(x)
grid = lambda meta: ((x.numel() + meta['BLOCK_SIZE'] - 1) // meta['BLOCK_SIZE'],)
grid = lambda meta: ((x.numel() + meta["BLOCK_SIZE"] - 1) // meta["BLOCK_SIZE"],)
if op_id == 0: # Division - binary operation
assert y is not None, "Division operation requires second operand"
......@@ -321,9 +310,7 @@ def triton_libdevice_op(x: torch.Tensor,
return out
def get_pytorch_reference(x: torch.Tensor,
op_id: int,
y: Optional[torch.Tensor] = None) -> torch.Tensor:
def get_pytorch_reference(x: torch.Tensor, op_id: int, y: Optional[torch.Tensor] = None) -> torch.Tensor:
"""Get PyTorch reference implementation for the given operation."""
if op_id == 0:
assert y is not None, "Division requires second operand"
......@@ -362,8 +349,10 @@ def summarize_error(tag: str, output: Optional[torch.Tensor], reference: torch.T
abs_err = (output_double - reference_double).abs()
rel_err = abs_err / (reference_double.abs().clamp_min(1e-30))
print(f"{tag:<32} max abs: {abs_err.max():.3e}, mean abs: {abs_err.mean():.3e}, "
f"max rel: {rel_err.max():.3e}, mean rel: {rel_err.mean():.3e}")
print(
f"{tag:<32} max abs: {abs_err.max():.3e}, mean abs: {abs_err.mean():.3e}, "
f"max rel: {rel_err.max():.3e}, mean rel: {rel_err.mean():.3e}"
)
# Precision comparison function
......@@ -407,9 +396,7 @@ def compare(op_id: int, x: torch.Tensor, y: Optional[torch.Tensor] = None) -> No
results[name] = None
# Print comparison header
print(
f"{'Implementation':<32} {'Max Abs Error':<19} {'Mean Abs Error':<20} {'Max Rel Error':<19} {'Mean Rel Error'}"
)
print(f"{'Implementation':<32} {'Max Abs Error':<19} {'Mean Abs Error':<20} {'Max Rel Error':<19} {'Mean Rel Error'}")
print("-" * 90)
# Compare all implementations against double precision reference
......@@ -427,8 +414,7 @@ def compare(op_id: int, x: torch.Tensor, y: Optional[torch.Tensor] = None) -> No
summarize_error(tag, output, ref_double)
def generate_test_data(op_id: int, n: int, device: torch.device, low: float,
high: float) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
def generate_test_data(op_id: int, n: int, device: torch.device, low: float, high: float) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
"""Generate appropriate test data for each operation."""
if op_id == 0: # Division
x = torch.empty(n, device=device).uniform_(low, high)
......@@ -450,9 +436,7 @@ def generate_test_data(op_id: int, n: int, device: torch.device, low: float,
def main() -> None:
"""Main execution function."""
print(
"Precision comparison between CUDA Precise/Fast, Triton, Triton LibDevice, PyTorch, and TileLang"
)
print("Precision comparison between CUDA Precise/Fast, Triton, Triton LibDevice, PyTorch, and TileLang")
print("=" * 90)
for op_id in range(len(OP_NAMES)):
......
......@@ -10,39 +10,32 @@ env["TILELANG_CLEAR_CACHE"] = "1"
def parse_output(output):
    """Extract latency/TFlops/config fields from a performance run's stdout.

    Returns a dict with keys ``latency``, ``best_tflops``, ``config`` and
    ``ref_tflops`` for each matching line found; a value falls back to
    ``"N/A"`` when the line is present but the number cannot be parsed.
    """
    data = {}
    for line in output.split("\n"):
        line = line.strip()
        if line.startswith("Latency:"):
            match = re.search(r"Latency: ([\d.]+)", line)
            data["latency"] = match.group(1) if match else "N/A"
        elif line.startswith("TFlops:"):
            match = re.search(r"TFlops: ([\d.]+)", line)
            data["best_tflops"] = match.group(1) if match else "N/A"
        elif line.startswith("Config:"):
            data["config"] = line.split("Config: ")[-1]
        elif line.startswith("Reference TFlops:"):
            match = re.search(r"Reference TFlops: ([\d.]+)", line)
            data["ref_tflops"] = match.group(1) if match else "N/A"
    return data
output_v1 = subprocess.run(['./tl/bin/python', './maint/scripts/performance.py'],
capture_output=True,
text=True,
env=env).stdout
output_v1 = subprocess.run(["./tl/bin/python", "./maint/scripts/performance.py"], capture_output=True, text=True, env=env).stdout
data_v1 = parse_output(output_v1)
output_v2 = subprocess.run(['./tll/bin/python', './maint/scripts/performance.py'],
capture_output=True,
text=True,
env=env).stdout
output_v2 = subprocess.run(["./tll/bin/python", "./maint/scripts/performance.py"], capture_output=True, text=True, env=env).stdout
data_v2 = parse_output(output_v2)
table = [[
"original", data_v1['latency'], data_v1['best_tflops'], data_v1['ref_tflops'], data_v1['config']
], [
"current", data_v2['latency'], data_v2['best_tflops'], data_v2['ref_tflops'], data_v2['config']
]]
table = [
["original", data_v1["latency"], data_v1["best_tflops"], data_v1["ref_tflops"], data_v1["config"]],
["current", data_v2["latency"], data_v2["best_tflops"], data_v2["ref_tflops"], data_v2["config"]],
]
headers = ["version", "Best Latency (s)", "Best TFlops", "Reference TFlops", "Best Config"]
......
......@@ -2,4 +2,4 @@
set -euxo pipefail
# Build for local architecture
CIBW_BUILD='cp38-*' cibuildwheel .
CIBW_BUILD='cp39-*' cibuildwheel . 2>&1 | tee cibuildwheel.log
......@@ -12,9 +12,8 @@ if docker buildx version >/dev/null 2>&1; then
docker buildx use multi >/dev/null 2>&1 || true
fi
docker buildx inspect --bootstrap >/dev/null 2>&1 || true
done
export CIBW_ARCHS='x86_64 aarch64'
fi
NO_VERSION_LABEL=ON CIBW_BUILD='cp38-*' cibuildwheel .
NO_VERSION_LABEL=ON CIBW_BUILD='cp39-*' cibuildwheel . 2>&1 | tee cibuildwheel.log
......@@ -8,19 +8,20 @@ def ref_program(A, B):
def get_configs():
    """Return the autotuner candidate configurations (a single config here)."""
    configs = [
        {
            "block_M": 128,
            "block_N": 128,
            "block_K": 64,
            "num_stages": 2,
            "thread_num": 256,
            "enable_rasteration": True,  # keep param name for backward-compat
        }
    ]
    return configs
def run(M, N, K):
def kernel(
block_M=None,
block_N=None,
......@@ -29,8 +30,8 @@ def run(M, N, K):
thread_num=None,
enable_rasteration=None,
):
dtype = "float16"
accum_dtype = "float"
dtype = T.float16
accum_dtype = T.float32
@T.prim_func
def main(
......@@ -38,8 +39,7 @@ def run(M, N, K):
B: T.Tensor((N, K), dtype),
C: T.Tensor((M, N), dtype),
):
with T.Kernel(
T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=thread_num) as (bx, by):
with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=thread_num) as (bx, by):
A_shared = T.alloc_shared((block_M, block_K), dtype)
B_shared = T.alloc_shared((block_N, block_K), dtype)
C_local = T.alloc_fragment((block_M, block_N), accum_dtype)
......@@ -60,12 +60,16 @@ def run(M, N, K):
return main
autotuner = AutoTuner.from_kernel(
kernel=kernel, configs=get_configs()).set_compile_args(
autotuner = (
AutoTuner.from_kernel(kernel=kernel, configs=get_configs())
.set_compile_args(
out_idx=[-1],
target="auto",
).set_profile_args(
ref_prog=ref_program,)
)
.set_profile_args(
ref_prog=ref_program,
)
)
return autotuner.run(warmup=3, rep=20)
......
FROM quay.io/pypa/manylinux2014_x86_64 AS builder_amd64
FROM quay.io/pypa/manylinux_2_28_x86_64 AS builder_amd64
RUN yum-config-manager --add-repo https://developer.download.nvidia.cn/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo
RUN dnf config-manager --add-repo https://developer.download.nvidia.cn/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo
ARG CUDA_VERSION=12.1
ARG CUDA_VERSION=12.8
ENV CUDA_VERSION=${CUDA_VERSION}
FROM quay.io/pypa/manylinux_2_28_aarch64 AS builder_arm64
......
......@@ -14,7 +14,13 @@ cd examples
python -m pytest -n 4 . --verbose --color=yes --durations=0 --showlocals --cache-clear
cd ..
# Run pytest in parallel (4 workers) for all tests in the testing/python directory
# Run pytest in parallel (4 workers) for all tests in the testing/python directory.
# IMPORTANT: CuTeDSL backend currently requires GEMM v1 (TILELANG_USE_GEMM_V1=1).
# Do NOT export it globally here, or you'll silently change the default GEMM selection
# for unrelated tests. Run the CuTeDSL JIT tests in a separate pytest invocation.
cd testing/python
python -m pytest -n 4 . --verbose --color=yes --durations=0 --showlocals --cache-clear
python -m pytest -n 4 . --ignore=jit/test_tilelang_jit_cutedsl.py --verbose --color=yes --durations=0 --showlocals --cache-clear
# CuTeDSL JIT tests (isolate env + avoid xdist contention on a single GPU)
TILELANG_USE_GEMM_V1=1 python -m pytest -n 1 jit/test_tilelang_jit_cutedsl.py --verbose --color=yes --durations=0 --showlocals --cache-clear
cd ..
......@@ -27,7 +27,14 @@ classifiers = [
]
dynamic = ["version"]
dependencies = [
"apache-tvm-ffi==0.1.0",
"apache-tvm-ffi~=0.1.0",
# Extra constraint to tvm-ffi for abi issue,
# should be removed after our tvm's update.
# See discussion in tilelang#1373 and apache/tvm-ffi#307
"apache-tvm-ffi>=0.1.3",
# torch-c-dlpack-ext provides prebuilt torch extensions.
# Without it, TVM FFI may require JIT compilation on first import.
"torch-c-dlpack-ext",
"cloudpickle",
"ml-dtypes",
"numpy>=1.23.5",
......@@ -36,15 +43,25 @@ dependencies = [
"torch>=2.7; platform_system == 'Darwin'",
"tqdm>=4.62.3",
"typing-extensions>=4.10.0",
"z3-solver>=4.13.0",
]
[project.optional-dependencies]
# ml-dtypes must be >= 0.5.1
# if you want to enable fp4
fp4 = ["ml-dtypes>=0.5.1"]
# if you want to enable layout inference visualization
vis = ["matplotlib"]
[build-system]
requires = ["cython>=3.0.0", "scikit-build-core"]
requires = [
"cython>=3.0.0",
"scikit-build-core",
"z3-solver>=4.13.0",
# Not for auditwheel, explicitly add patchelf for repairing libz3.so.
# See tvm's CMakeLists.txt for more information.
"patchelf>=0.17.2; platform_system == 'Linux'",
]
build-backend = "scikit_build_core.build"
[tool.scikit-build]
......@@ -104,6 +121,7 @@ tilelang = "tilelang"
# TVM
"tilelang/3rdparty/tvm/src" = "3rdparty/tvm/src"
"tilelang/3rdparty/tvm/python" = "3rdparty/tvm/python"
"tilelang/3rdparty/tvm/include" = "3rdparty/tvm/include"
"tilelang/3rdparty/tvm/version.py" = "3rdparty/tvm/version.py"
# CUTLASS
"tilelang/3rdparty/cutlass/include" = "3rdparty/cutlass/include"
......@@ -112,10 +130,7 @@ tilelang = "tilelang"
"tilelang/3rdparty/composable_kernel/include" = "3rdparty/composable_kernel/include"
"tilelang/3rdparty/composable_kernel/library" = "3rdparty/composable_kernel/library"
[tool.yapf]
based_on_style = "yapf"
column_limit = 100
indent_width = 4
[tool.codespell]
ignore-words = "docs/spelling_wordlist.txt"
......@@ -128,7 +143,7 @@ skip = [
[tool.ruff]
target-version = "py39"
line-length = 100
line-length = 140
output-format = "full"
exclude = [
......@@ -136,6 +151,14 @@ exclude = [
"examples/deepseek_v32/inference",
]
[tool.ruff.format]
quote-style = "double"
indent-style = "space"
skip-magic-trailing-comma = false
line-ending = "auto"
docstring-code-format = false
docstring-code-line-length = "dynamic"
[tool.ruff.lint.per-file-ignores]
# Do not upgrade type hint in testing and examples.
# See https://github.com/tile-ai/tilelang/issues/1079 for more information.
......@@ -211,12 +234,10 @@ environment.PYTHONDEVMODE = "1"
environment.PYTHONUNBUFFERED = "1"
environment.PATH = "/usr/local/cuda/bin:$PATH"
environment.LD_LIBRARY_PATH = "/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH"
# Pin to glibc 2.17 for x86 and 2.28 for aarch64 for now
# TODO: upgrade to manylinux_2_28 at some time
manylinux-x86_64-image = "manylinux2014" # CentOS 7
manylinux-aarch64-image = "manylinux_2_28" # AlmaLinux 8
manylinux-x86_64-image = "manylinux_2_28" # AlmaLinux 8
manylinux-aarch64-image = "manylinux_2_34" # Z3 requires
# Install CUDA runtime and stub driver library
# manylinux_2_28 uses gcc 14, which needs CUDA 12.8
# manylinux_2_28 uses gcc 14, which needs CUDA >=12.8
before-all = """
set -eux
......@@ -225,8 +246,8 @@ uname -a
case "$(uname -m)" in
"x86_64")
DEFAULT_CUDA_VERSION="12.1"
yum-config-manager --add-repo https://developer.download.nvidia.cn/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo
DEFAULT_CUDA_VERSION="12.8"
dnf config-manager --add-repo https://developer.download.nvidia.cn/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo
;;
"aarch64")
DEFAULT_CUDA_VERSION="12.8"
......@@ -240,9 +261,10 @@ esac
cudaver="$(echo "${CUDA_VERSION:-$DEFAULT_CUDA_VERSION}" | cut -d '.' -f-2)"
v="${cudaver//./-}"
yum install -y "cuda-minimal-build-${v}" "cuda-driver-devel-${v}" "cuda-nvrtc-devel-${v}" nvidia-driver-cuda-libs
yum clean all
"""
repair-wheel-command = [
"auditwheel -v repair --exclude libtvm_ffi.so --exclude libcuda.so.1 --exclude '/usr/local/cuda*' -w {dest_dir} {wheel}",
"auditwheel -v repair --exclude libtvm_ffi.so --exclude libz3.so --exclude libcuda.so.1 --exclude '/usr/local/cuda*' -w {dest_dir} {wheel}",
"pipx run abi3audit --verbose --strict {wheel}",
]
......@@ -254,7 +276,8 @@ repair-wheel-command = [
[[tool.cibuildwheel.overrides]]
select = "*linux*x86_64*"
# CentOS 7 is too old to run import test. Do wheel installation test only.
test-command = [
"echo 'Wheel is installed successfully'",
# x86_64 runners in GitHub Actions have limited storage,
# pre-install torch without caching to reduce disk usage during install tilelang.
before-test = [
"pip install torch --no-cache-dir",
]
# Requirements to run local build with `--no-build-isolation` or other developments
apache-tvm-ffi~=0.1.0
apache-tvm-ffi>=0.1.3
build
cmake>=3.26
cython>=3.0.0
......@@ -10,6 +10,7 @@ scikit-build-core
setuptools>=61
torch
wheel
z3-solver>=4.13.0
auditwheel; platform_system == 'Linux'
patchelf; platform_system == 'Linux'
......
# Format and lint requirements
pre-commit
clang-format==21.1.2
clang-tidy==21.1.1
clang-format==21.1.7
clang-tidy==21.1.6
codespell[toml]==2.4.1
ruff==0.14.3
yapf==0.43.0
ruff==0.14.9
......@@ -6,3 +6,6 @@
# CUDA specific requirements
flash-attn==2.5.8
cuda-python==12.9.4
# CuTeDSL (CUTLASS Python DSL with CuTe support)
nvidia-cutlass-dsl>=4.3.1
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment