Unverified Commit d4b6d094 authored by Lei Wang, committed by GitHub

[Enhancement] Shared Memory Size Can be Dynamic (#1294)

* bugfix

* lint fix

* test

* lint fix

* increase procs

* recover
parent dd7fdb8e
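The enhancement lets a kernel tie its shared memory allocation to a symbolic length, so a single compiled kernel can be launched with a different dynamic shared memory size on each call. A condensed sketch of that usage pattern, adapted from the test file added further down in this commit (the name dyn_smem_copy, the buffer names, and the sizes 128/512 are illustrative, not part of the change):

import torch

import tilelang
import tilelang.language as T


@tilelang.jit
def dyn_smem_copy():
    # Shared allocation whose extent is a symbolic length resolved at call time
    length = T.symbolic("len", dtype="int32")

    @T.prim_func
    def main(src: T.Tensor[(length,), "int32"]):
        with T.Kernel(1, threads=32) as _:
            smem = T.alloc_shared((length,), dtype="int32")
            T.copy(src, smem)  # global -> shared through the dynamically-sized buffer

    return main


kernel = dyn_smem_copy()
# The same compiled kernel is invoked with different dynamic shared memory sizes.
for n in (128, 512):
    kernel(torch.zeros(n, dtype=torch.int32, device="cuda"))
torch.cuda.synchronize()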
@@ -352,7 +352,7 @@ jobs:
uv run --no-project -m --
pytest --verbose --color=yes --durations=0 --showlocals --cache-clear
)
"${PYTEST[@]}" --maxfail=3 --numprocesses=1 \
"${PYTEST[@]}" --maxfail=3 --numprocesses=4 \
../examples
# NVIDIA CUDA tests
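For context, --numprocesses is the pytest-xdist worker-count flag that the CI wrapper above passes through. A rough Python equivalent of the updated invocation, assuming pytest-xdist is installed (the extra reporting flags from the PYTEST array are trimmed for brevity):

import pytest

# Run the examples suite on 4 workers, mirroring the CI change above.
raise SystemExit(pytest.main(["--maxfail=3", "--numprocesses=4", "../examples"]))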
......
-Subproject commit f4affc7f31e36e7f88c0fe1c715b03215c6a0c62
+Subproject commit 713e6ade56eaa72cc85d58d9228dd9f34cc2d03e
@@ -131,8 +131,7 @@ TL_DEVICE void AtomicMin(T1 &ref, T2 val,
} else {
#if CUDART_VERSION >= 11080
cuda::atomic_ref<NT1, cuda::thread_scope_device> aref(*address);
-return static_cast<T1>(
-    aref.fetch_min(cuda_cast<NT1>(val), cuda::memory_order(memory_order)));
+aref.fetch_min(cuda_cast<NT1>(val), cuda::memory_order(memory_order));
#else
TL_NOT_IMPLEMENTED();
#endif
......
@@ -374,10 +374,9 @@ def test_atomic_return_prev():
    run_atomic_return_prev(32, 32, 8, 8)

-# TODO(lei): test failed and this is experimental
-# CC @dyq
-# def test_tile_atomic_add():
-# run_tile_atomic_add(8, 128, 128, 32, 32)
+def test_tile_atomic_add():
+    run_tile_atomic_add(8, 128, 128, 32, 32)


if __name__ == "__main__":
    tilelang.testing.main()
import pytest
import torch

import tilelang
import tilelang.language as T
import tilelang.testing


@tilelang.jit
def dynamic_smem_kernel():
    # Symbolic length to drive dynamic shared memory allocation
    length = T.symbolic("len", dtype="int32")  # noqa: F821

    @T.prim_func
    def main(global_tensor: T.Tensor[(length,), "int32"]):  # noqa: F821
        # Launch a simple kernel that copies from global memory into shared memory
        # using a dynamically-sized allocation. No writes back to global_tensor.
        with T.Kernel(1, threads=32) as _:
            buffer_shared = T.alloc_shared((length,), dtype="int32")  # noqa: F821
            T.copy(global_tensor, buffer_shared)
    return main


def _require_cuda_tensor(shape, dtype):
    if not torch.cuda.is_available():
        pytest.skip("CUDA not available")
    try:
        return torch.randint(0, 100, shape, dtype=dtype, device="cuda")
    except RuntimeError as err:
        pytest.skip(f"CUDA runtime unavailable: {err}")


def _run_and_check(kernel, n):
    a = _require_cuda_tensor((n,), torch.int32)
    kernel(a)
    torch.cuda.synchronize()


def test_dynamic_shared_memory_varies_across_calls():
    kernel = dynamic_smem_kernel()
    # Run with different dynamic shared memory sizes across invocations
    _run_and_check(kernel, 100)
    _run_and_check(kernel, 200)
    # Repeat sizes to exercise attribute caching path
    _run_and_check(kernel, 200)
    _run_and_check(kernel, 100)


if __name__ == "__main__":
    tilelang.testing.main()