import pytest

import cupy
from cupy.cuda import runtime
from cupyx import jit


@pytest.mark.skipif(runtime.is_hip, reason="not supported on HIP")
class TestCooperativeGroups:
    """Tests for the ``jit.cg`` cooperative-groups API in JIT raw kernels.

    Each test defines a small ``@jit.rawkernel()`` kernel in which selected
    threads write the result of one group query into a distinct slot of an
    output array.  The array is pre-filled with a sentinel so the host-side
    assertions can also check that no other slot was written.
    """

    def test_thread_block_group(self):
        """Check the ``this_thread_block()`` queries for a ``[1, 32]`` launch."""
        @jit.rawkernel()
        def test_thread_block(x):
            g = jit.cg.this_thread_block()
            if g.thread_rank() == 0:
                x[0] += 101
            if g.thread_rank() == 1:
                x[1] = g.size()
            # test dim3
            if g.thread_rank() == 2:
                g_idx = g.group_index()
                x[2], x[3], x[4] = g_idx.x, g_idx.y, g_idx.z
            if g.thread_rank() == 3:
                t_idx = g.thread_index()
                x[5], x[6], x[7] = t_idx.x, t_idx.y, t_idx.z
            if g.thread_rank() == 4:
                g_dim = g.group_dim()
                x[8], x[9], x[10] = g_dim.x, g_dim.y, g_dim.z
            g.sync()

        x = cupy.empty((16,), dtype=cupy.int64)
        x[:] = -1  # sentinel; untouched slots must still hold -1 afterwards
        test_thread_block[1, 32](x)
        assert x[0] == 100  # -1 + 101
        assert x[1] == 32
        assert (x[2], x[3], x[4]) == (0, 0, 0)
        assert (x[5], x[6], x[7]) == (3, 0, 0)
        assert (x[8], x[9], x[10]) == (32, 1, 1)
        assert (x[11:] == -1).all()

    @pytest.mark.skipif(
        runtime.runtimeGetVersion() < 11060 or (
            cupy.cuda.driver._is_cuda_python()
            and cupy.cuda.nvrtc.getVersion() < (11, 6)),
        reason='not supported until CUDA 11.6')
    def test_thread_block_group_cu116_new_APIs(self):
        """Check ``num_threads()`` / ``dim_threads()`` on a thread block."""
        @jit.rawkernel()
        def test_thread_block(x):
            g = jit.cg.this_thread_block()
            if g.thread_rank() == 0:
                x[0] = g.num_threads()
            if g.thread_rank() == 1:
                d_th = g.dim_threads()
                x[1], x[2], x[3] = d_th.x, d_th.y, d_th.z
            g.sync()

        x = cupy.empty((16,), dtype=cupy.int64)
        x[:] = -1  # sentinel; untouched slots must still hold -1 afterwards
        test_thread_block[1, 32](x)
        assert x[0] == 32
        assert (x[1], x[2], x[3]) == (32, 1, 1)
        assert (x[4:] == -1).all()

    @pytest.mark.skipif(
        runtime.runtimeGetVersion() < 11000,
        reason='we do not support it')
    @pytest.mark.skipif(runtime.deviceGetAttribute(
        runtime.cudaDevAttrCooperativeLaunch, 0) == 0,
        reason='cooperative launch is not supported on device 0')
    def test_grid_group(self):
        """Check the ``this_grid()`` queries for a ``[2, 32]`` launch."""
        @jit.rawkernel()
        def test_grid(x):
            g = jit.cg.this_grid()
            if g.thread_rank() == 0:
                x[0] = g.is_valid()
            if g.thread_rank() == 1:
                x[1] = g.size()
            if g.thread_rank() == 32:  # on the 2nd block
                # Note: this is not yet possible...
                # x[2], x[3], x[4] = g.group_dim()
                g_dim = g.group_dim()
                x[2], x[3], x[4] = g_dim.x, g_dim.y, g_dim.z
            g.sync()  # this should just work!

        x = cupy.empty((16,), dtype=cupy.uint64)
        x[:] = -1  # = 2**64-1
        test_grid[2, 32](x)
        assert x[0] == 1
        assert x[1] == 64
        assert (x[2], x[3], x[4]) == (2, 1, 1)
        assert (x[5:] == 2**64-1).all()

    @pytest.mark.skipif(
        runtime.runtimeGetVersion() < 11060 or (
            cupy.cuda.driver._is_cuda_python()
            and cupy.cuda.nvrtc.getVersion() < (11, 6)),
        reason='not supported until CUDA 11.6')
    @pytest.mark.skipif(runtime.deviceGetAttribute(
        runtime.cudaDevAttrCooperativeLaunch, 0) == 0,
        reason='cooperative launch is not supported on device 0')
    def test_grid_group_cu116_new_APIs(self):
        """Check the grid-group queries that are gated on CUDA 11.6."""
        @jit.rawkernel()
        def test_grid(x):
            g = jit.cg.this_grid()
            if g.thread_rank() == 1:
                x[1] = g.num_threads()
            if g.thread_rank() == 32:
                g_dim = g.dim_blocks()
                x[2], x[3], x[4] = g_dim.x, g_dim.y, g_dim.z
            if g.thread_rank() == 33:  # on the 2nd block
                x[5] = g.block_rank()
            if g.thread_rank() == 2:
                x[6] = g.num_blocks()
            if g.thread_rank() == 34:  # on the 2nd block
                b_idx = g.block_index()
                x[7], x[8], x[9] = b_idx.x, b_idx.y, b_idx.z
            g.sync()  # this should just work!

        x = cupy.empty((16,), dtype=cupy.uint64)
        x[:] = -1  # = 2**64-1
        test_grid[2, 32](x)
        # x[0] is never written by the kernel and is not asserted on.
        assert x[1] == 64
        assert (x[2], x[3], x[4]) == (2, 1, 1)
        assert x[5] == 1
        assert x[6] == 2
        assert (x[7], x[8], x[9]) == (1, 0, 0)
        assert (x[10:] == 2**64-1).all()

    @pytest.mark.skipif(
        runtime.runtimeGetVersion() < 11000,
        reason='we do not support it')
    @pytest.mark.skipif(runtime.deviceGetAttribute(
        runtime.cudaDevAttrCooperativeLaunch, 0) == 0,
        reason='cooperative launch is not supported on device 0')
    def test_cg_sync(self):
        """Check that ``jit.cg.sync`` accepts block and grid groups.

        The kernel takes no arguments and there are no assertions; the test
        passes if the kernel compiles and the launch does not raise.
        """
        @jit.rawkernel()
        def test_sync():
            b = jit.cg.this_thread_block()
            g = jit.cg.this_grid()
            jit.cg.sync(b)
            jit.cg.sync(g)

        test_sync[2, 64]()

    # We also skip CUDA 11.0 due to missing support of memcpy_async
    # and aligned_size_t...
    @pytest.mark.skipif(
        runtime.runtimeGetVersion() < 11010,
        reason='not supported until CUDA 11.1')
    @pytest.mark.parametrize(
        'test_aligned', (True, False),
    )
    def test_cg_memcpy_async_wait_for_wait(self, test_aligned):
        """Check ``memcpy_async`` together with ``wait_prior`` and ``wait``.

        Two 32-element copies from ``x`` into shared memory are issued; the
        first half is read back after ``wait_prior(g, 1)`` and the second
        half after ``wait(g)``.  The output ``y`` must end up equal to ``x``.

        Args:
            test_aligned: When true, the shared buffer is declared with
                ``alignment=16`` and the copies pass ``aligned_size=16``;
                otherwise both are left at their defaults.
        """
        @jit.rawkernel()
        def test_copy(x, y):
            # do two batches of copies to test relevant APIs
            if test_aligned:
                smem = jit.shared_memory(cupy.int32, 32*2, alignment=16)
            else:
                smem = jit.shared_memory(cupy.int32, 32*2)
            g = jit.cg.this_thread_block()
            tid = g.thread_rank()
            # int32 is 4 bytes
            if test_aligned:
                # CuPy ensures x is 256B-aligned
                jit.cg.memcpy_async(
                    g, smem, 0, x, 0, 4*32, aligned_size=16)
                jit.cg.memcpy_async(
                    g, smem, 32, x, 32, 4*32, aligned_size=16)
            else:
                jit.cg.memcpy_async(
                    g, smem, 0, x, 0, 4*32)
                jit.cg.memcpy_async(
                    g, smem, 32, x, 32, 4*32)
            jit.cg.wait_prior(g, 1)
            if tid < 32:
                y[tid] = smem[tid]
            jit.cg.wait(g)
            if 32 <= tid and tid < 64:  # can't do "32 <= tid < 64" yet...
                y[tid] = smem[tid]

        x = cupy.arange(64, dtype=cupy.int32)
        y = cupy.zeros(64, dtype=cupy.int32)
        test_copy[2, 64](x, y)
        assert (x == y).all()