Unverified commit d4b6d094, authored by Lei Wang and committed by GitHub

[Enhancement] Shared Memory Size Can be Dynamic (#1294)

* bugfix

* lint fix

* test

* lint fix

* increase procs

* recover
parent dd7fdb8e
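For background on what the title means at the CUDA level, here is a minimal sketch of the general mechanism (assumed background, not the code this PR generates): the kernel declares its shared buffer as `extern __shared__`, and the byte count arrives as the third launch parameter, so each call can use a different size.

```cuda
#include <cuda_runtime.h>

// Background sketch only (not TileLang's generated code): the shared buffer's
// size is supplied at launch time, so every call may request a different amount.
__global__ void fill_smem(const int *src, int n) {
  extern __shared__ int buf[];  // size comes from the launch configuration
  for (int i = threadIdx.x; i < n; i += blockDim.x)
    buf[i] = src[i];
}

void launch_with_dynamic_smem(const int *d_src, int n) {
  size_t smem_bytes = static_cast<size_t>(n) * sizeof(int);
  // Third launch parameter = dynamic shared memory bytes for this call.
  fill_smem<<<1, 32, smem_bytes>>>(d_src, n);
}
```

The new test added at the bottom of this commit drives the same idea from the TileLang side through a symbolic buffer length.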
...@@ -352,7 +352,7 @@ jobs: ...@@ -352,7 +352,7 @@ jobs:
uv run --no-project -m -- uv run --no-project -m --
pytest --verbose --color=yes --durations=0 --showlocals --cache-clear pytest --verbose --color=yes --durations=0 --showlocals --cache-clear
) )
"${PYTEST[@]}" --maxfail=3 --numprocesses=1 \ "${PYTEST[@]}" --maxfail=3 --numprocesses=4 \
../examples ../examples
# NVIDIA CUDA tests # NVIDIA CUDA tests
......
-Subproject commit f4affc7f31e36e7f88c0fe1c715b03215c6a0c62
+Subproject commit 713e6ade56eaa72cc85d58d9228dd9f34cc2d03e
@@ -131,8 +131,7 @@ TL_DEVICE void AtomicMin(T1 &ref, T2 val,
   } else {
 #if CUDART_VERSION >= 11080
     cuda::atomic_ref<NT1, cuda::thread_scope_device> aref(*address);
-    return static_cast<T1>(
-        aref.fetch_min(cuda_cast<NT1>(val), cuda::memory_order(memory_order)));
+    aref.fetch_min(cuda_cast<NT1>(val), cuda::memory_order(memory_order));
 #else
     TL_NOT_IMPLEMENTED();
 #endif
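The hunk header shows this `AtomicMin` overload is declared `void`, so wrapping the update in `return static_cast<T1>(...)` served no purpose; `cuda::atomic_ref::fetch_min` still returns the pre-update value, and surfacing it is the job of a return-previous variant (the kind exercised by `test_atomic_return_prev` in the next hunk). A hedged illustration of the two call shapes, not the repository's actual helpers:

```cuda
#include <cuda/atomic>

// Illustration only: fetch_min always yields the value stored before the
// update; whether a wrapper exposes it is purely a matter of its signature.
template <typename T>
__device__ void atomic_min_discard(T *addr, T val) {
  cuda::atomic_ref<T, cuda::thread_scope_device> ref(*addr);
  ref.fetch_min(val);  // previous value intentionally ignored (void wrapper)
}

template <typename T>
__device__ T atomic_min_return_prev(T *addr, T val) {
  cuda::atomic_ref<T, cuda::thread_scope_device> ref(*addr);
  return ref.fetch_min(val);  // previous value handed back to the caller
}
```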
@@ -374,10 +374,9 @@ def test_atomic_return_prev():
     run_atomic_return_prev(32, 32, 8, 8)
 
 
-# TODO(lei): test failed and this is experimental
-# CC @dyq
-# def test_tile_atomic_add():
-#     run_tile_atomic_add(8, 128, 128, 32, 32)
+def test_tile_atomic_add():
+    run_tile_atomic_add(8, 128, 128, 32, 32)
+
 
 if __name__ == "__main__":
     tilelang.testing.main()
import pytest
import torch

import tilelang
import tilelang.language as T
import tilelang.testing


@tilelang.jit
def dynamic_smem_kernel():
    # Symbolic length to drive dynamic shared memory allocation
    length = T.symbolic("len", dtype="int32")  # noqa: F821

    @T.prim_func
    def main(global_tensor: T.Tensor[(length,), "int32"]):  # noqa: F821
        # Launch a simple kernel that copies from global memory into shared
        # memory using a dynamically-sized allocation. No writes back to
        # global_tensor.
        with T.Kernel(1, threads=32) as _:
            buffer_shared = T.alloc_shared((length,), dtype="int32")  # noqa: F821
            T.copy(global_tensor, buffer_shared)

    return main


def _require_cuda_tensor(shape, dtype):
    if not torch.cuda.is_available():
        pytest.skip("CUDA not available")
    try:
        return torch.randint(0, 100, shape, dtype=dtype, device="cuda")
    except RuntimeError as err:
        pytest.skip(f"CUDA runtime unavailable: {err}")


def _run_and_check(kernel, n):
    a = _require_cuda_tensor((n,), torch.int32)
    kernel(a)
    torch.cuda.synchronize()


def test_dynamic_shared_memory_varies_across_calls():
    kernel = dynamic_smem_kernel()
    # Run with different dynamic shared memory sizes across invocations
    _run_and_check(kernel, 100)
    _run_and_check(kernel, 200)
    # Repeat sizes to exercise the attribute caching path
    _run_and_check(kernel, 200)
    _run_and_check(kernel, 100)


if __name__ == "__main__":
    tilelang.testing.main()
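The test's note about an "attribute caching path" presumably refers to reusing launch configuration when a shared-memory size repeats. One plausible reading at the CUDA runtime level, sketched with a hypothetical helper (TileLang's actual launcher may work differently): `cudaFuncAttributeMaxDynamicSharedMemorySize` only needs to be raised when a request exceeds both the 48 KiB opt-in default and whatever was configured before, so repeated or smaller sizes can skip the call.

```cuda
#include <cuda_runtime.h>

// Hypothetical caching guard, not TileLang's actual launcher: remember the
// largest dynamic shared memory size configured so far and only raise the
// attribute when a new request exceeds it.
constexpr int kDefaultDynamicSmemLimit = 48 * 1024;
static int g_configured_smem = kDefaultDynamicSmemLimit;

cudaError_t ensure_dynamic_smem(const void *kernel_func, int requested_bytes) {
  if (requested_bytes <= g_configured_smem)
    return cudaSuccess;  // cached: current limit already covers this request
  cudaError_t err = cudaFuncSetAttribute(
      kernel_func, cudaFuncAttributeMaxDynamicSharedMemorySize, requested_bytes);
  if (err == cudaSuccess)
    g_configured_smem = requested_bytes;
  return err;
}
```

A caller would invoke `ensure_dynamic_smem(reinterpret_cast<const void *>(&my_kernel), smem_bytes)` before the launch; `my_kernel` is a stand-in name here.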