import numpy import pytest import cupy from cupyx import jit from cupy import testing from cupy.cuda import device from cupy.cuda import runtime class TestRaw: def test_raw_grid_1D(self): @jit.rawkernel() def f(arr1, arr2): x = jit.grid(1) if x < arr1.size: arr2[x] = arr1[x] x = cupy.arange(10) y = cupy.empty_like(x) f((1,), (10,), (x, y)) assert (x == y).all() def test_raw_grid_2D(self): @jit.rawkernel() def f(arr1, arr2, n, m): x, y = jit.grid(2) # TODO(leofang): make it possible to write this: # if x < arr1.shape[0] and y < arr1.shape[1]: if x < n and y < m: arr2[x, y] = arr1[x, y] x = cupy.arange(20).reshape(4, 5) y = cupy.empty_like(x) f((1,), (4, 5), (x, y, x.shape[0], x.shape[1])) assert (x == y).all() def test_raw_grid_3D(self): @jit.rawkernel() def f(arr1, arr2, k, m, n): x, y, z = jit.grid(3) if x < k and y < m and z < n: arr2[x, y, z] = arr1[x, y, z] l, m, n = (2, 3, 4) x = cupy.arange(24).reshape(l, m, n) y = cupy.empty_like(x) f(((l+1)//2, (m+1)//2, (n+1)//2), (2, 2, 2), (x, y, l, m, n)) assert (x == y).all() def test_raw_grid_invalid1(self): @jit.rawkernel() def f(): x, = jit.grid(1) # cannot unpack an int with pytest.raises(ValueError): f((1,), (1,), ()) def test_raw_grid_invalid2(self): @jit.rawkernel() def f(): x = jit.grid(2) y = cupy.int64(x) # <- x is a tuple # NOQA # we don't care the exception type as long as something is raised with pytest.raises(Exception): f((1,), (1,), ()) def test_raw_grid_invalid3(self): for n in (0, 4, 'abc', [0], (1,)): @jit.rawkernel() def f(): x = jit.grid(n) # n can only be 1, 2, 3 (as int) # NOQA err = ValueError if isinstance(n, int) else TypeError with pytest.raises(err): f((1,), (1,), ()) def test_raw_gridsize_1D(self): @jit.rawkernel() def f(arr): x = jit.grid(1) gx = jit.gridsize(1) if x == 0: arr[x] = gx x = cupy.zeros(1) grid = 8 block = 10 f((grid,), (block,), (x,)) assert x == grid * block def test_raw_gridsize_2D(self): @jit.rawkernel() def f(arr): x, y = jit.grid(2) gx, gy = jit.gridsize(2) if x == 0 and y == 0: arr[0] = gx arr[1] = gy x = cupy.zeros(2) grid = (2, 7) block = (3, 5) f(grid, block, (x,)) assert x[0] == grid[0] * block[0] assert x[1] == grid[1] * block[1] def test_raw_gridsize_3D(self): @jit.rawkernel() def f(arr): x, y, z = jit.grid(3) gx, gy, gz = jit.gridsize(3) if x == 0 and y == 0 and z == 0: arr[0] = gx arr[1] = gy arr[2] = gz x = cupy.zeros(3) grid = (2, 7, 4) block = (3, 5, 11) f(grid, block, (x,)) assert x[0] == grid[0] * block[0] assert x[1] == grid[1] * block[1] assert x[2] == grid[2] * block[2] def test_raw_one_thread(self): @jit.rawkernel() def f(x, y): y[0] = x[0] x = cupy.array([10], dtype=numpy.int32) y = cupy.array([20], dtype=numpy.int32) f((1,), (1,), (x, y)) assert int(y[0]) == 10 def test_raw_elementwise_single_op(self): @jit.rawkernel() def f(x, y): tid = jit.threadIdx.x + jit.blockDim.x * jit.blockIdx.x y[tid] = x[tid] x = testing.shaped_random((30,), dtype=numpy.int32, seed=0) y = testing.shaped_random((30,), dtype=numpy.int32, seed=1) f((5,), (6,), (x, y)) assert bool((x == y).all()) def test_raw_elementwise_loop(self): @jit.rawkernel() def f(x, y, size): tid = jit.threadIdx.x + jit.blockDim.x * jit.blockIdx.x ntid = jit.blockDim.x * jit.gridDim.x for i in range(tid, size, ntid): y[i] = x[i] x = testing.shaped_random((1024,), dtype=numpy.int32, seed=0) y = testing.shaped_random((1024,), dtype=numpy.int32, seed=1) f((5,), (6,), (x, y, numpy.uint32(1024))) assert bool((x == y).all()) def test_raw_multidimensional_array(self): @jit.rawkernel() def f(x, y, n_row, n_col): tid = jit.threadIdx.x + jit.blockDim.x * jit.blockIdx.x ntid = jit.blockDim.x * jit.gridDim.x size = n_row * n_col for i in range(tid, size, ntid): i_row = i // n_col i_col = i % n_col y[i_row, i_col] = x[i_row, i_col] n, m = numpy.uint32(12), numpy.uint32(13) x = testing.shaped_random((n, m), dtype=numpy.int32, seed=0) y = testing.shaped_random((n, m), dtype=numpy.int32, seed=1) f((5,), (6,), (x, y, n, m)) assert bool((x == y).all()) def test_raw_multidimensional_array_with_attr(self): @jit.rawkernel() def f(x, y): tid = jit.threadIdx.x + jit.blockDim.x * jit.blockIdx.x ntid = jit.blockDim.x * jit.gridDim.x n_col = x.size // len(x) for i in range(tid, x.size, ntid): i_row = i // n_col i_col = i % n_col y[i_row, i_col] = x[i_row, i_col] n, m = numpy.uint32(12), numpy.uint32(13) x = testing.shaped_random((n, m), dtype=numpy.int32, seed=0) y = testing.shaped_random((n, m), dtype=numpy.int32, seed=1) f((5,), (6,), (x, y)) assert bool((x == y).all()) def test_raw_ndim(self): @jit.rawkernel() def f(x, y): y[0] = x.ndim x = cupy.empty((1, 1, 1, 1, 1, 1, 1), dtype=numpy.int32) y = cupy.zeros((1,), dtype=numpy.int64) f((1,), (1,), (x, y)) assert y.item() == 7 def test_raw_0dim_array(self): @jit.rawkernel() def f(x, y): y[()] = x[()] x = testing.shaped_random((), dtype=numpy.int32, seed=0) y = testing.shaped_random((), dtype=numpy.int32, seed=1) f((1,), (1,), (x, y)) assert bool((x == y).all()) def test_min(self): @jit.rawkernel() def f(x, y, z, r): tid = jit.blockDim.x * jit.blockIdx.x + jit.threadIdx.x r[tid] = min(x[tid], y[tid], z[tid]) x = testing.shaped_random((1024,), dtype=numpy.int32, seed=0) y = testing.shaped_random((1024,), dtype=numpy.int32, seed=1) z = testing.shaped_random((1024,), dtype=numpy.int32, seed=2) r = testing.shaped_random((1024,), dtype=numpy.int32, seed=3) f((8,), (128,), (x, y, z, r)) expected = cupy.minimum(x, cupy.minimum(y, z)) assert bool((r == expected).all()) def test_max(self): @jit.rawkernel() def f(x, y, z, r): tid = jit.blockDim.x * jit.blockIdx.x + jit.threadIdx.x r[tid] = max(x[tid], y[tid], z[tid]) x = testing.shaped_random((1024,), dtype=numpy.int32, seed=0) y = testing.shaped_random((1024,), dtype=numpy.int32, seed=1) z = testing.shaped_random((1024,), dtype=numpy.int32, seed=2) r = testing.shaped_random((1024,), dtype=numpy.int32, seed=3) f((8,), (128,), (x, y, z, r)) expected = cupy.maximum(x, cupy.maximum(y, z)) assert bool((r == expected).all()) def test_syncthreads(self): @jit.rawkernel() def f(x, y, buf): tid = jit.threadIdx.x + jit.threadIdx.y * jit.blockDim.x ntid = jit.blockDim.x * jit.blockDim.y buf[tid] = x[ntid - tid - 1] jit.syncthreads() y[tid] = buf[ntid - tid - 1] x = testing.shaped_random((1024,), dtype=numpy.int32, seed=0) y = testing.shaped_random((1024,), dtype=numpy.int32, seed=1) buf = testing.shaped_random((1024,), dtype=numpy.int32, seed=2) f((1,), (32, 32), (x, y, buf)) assert bool((x == y).all()) def test_syncwarp(self): @jit.rawkernel() def f(x): laneId = jit.threadIdx.x & 0x1f if laneId < 16: x[laneId] = 1 else: x[laneId] = 2 jit.syncwarp() x = cupy.zeros((32,), dtype=numpy.int32) y = cupy.ones_like(x) f((1,), (32,), (x,)) y[16:] += 1 assert bool((x == y).all()) def test_syncwarp_mask(self): @jit.rawkernel() def f(x, m): laneId = jit.threadIdx.x & 0x1f if laneId < m: x[laneId] = 1 jit.syncwarp(mask=(1 << m) - 1) for mask in (2, 4, 8, 16, 32): x = cupy.zeros((32,), dtype=numpy.int32) y = cupy.zeros_like(x) f((1,), (32,), (x, mask)) y[:mask] += 1 assert bool((x == y).all()) def test_shared_memory_static(self): @jit.rawkernel() def f(x, y): tid = jit.threadIdx.x ntid = jit.blockDim.x bid = jit.blockIdx.x i = tid + bid * ntid smem = jit.shared_memory(numpy.int32, 32) smem[tid] = x[i] jit.syncthreads() y[i] = smem[ntid - tid - 1] x = testing.shaped_random((1024,), dtype=numpy.int32, seed=0) y = testing.shaped_random((1024,), dtype=numpy.int32, seed=1) f((32,), (32,), (x, y)) expected = x.reshape(32, 32)[:, ::-1].ravel() assert bool((y == expected).all()) def test_shared_memory_dynamic(self): @jit.rawkernel() def f(x, y): tid = jit.threadIdx.x ntid = jit.blockDim.x bid = jit.blockIdx.x i = tid + bid * ntid smem = jit.shared_memory(numpy.int32, None) smem[tid] = x[i] jit.syncthreads() y[i] = smem[ntid - tid - 1] x = testing.shaped_random((1024,), dtype=numpy.int32, seed=0) y = testing.shaped_random((1024,), dtype=numpy.int32, seed=1) f((32,), (32,), (x, y), shared_mem=128) expected = x.reshape(32, 32)[:, ::-1].ravel() assert bool((y == expected).all()) @staticmethod def _check(a, e): if a.dtype == numpy.float16: testing.assert_allclose(a, e, rtol=3e-2, atol=3e-2) else: testing.assert_allclose(a, e, rtol=1e-5, atol=1e-5) @testing.for_dtypes('iILQfd' if runtime.is_hip else 'iILQefd') def test_atomic_add(self, dtype): @jit.rawkernel() def f(x, index, out): tid = jit.blockDim.x * jit.blockIdx.x + jit.threadIdx.x jit.atomic_add(out, index[tid], x[tid]) x = testing.shaped_random((1024,), dtype=dtype, seed=0) index = testing.shaped_random( (1024,), dtype=numpy.bool_, seed=1).astype(numpy.int32) out = cupy.zeros((2,), dtype=dtype) f((32,), (32,), (x, index, out)) expected = cupy.zeros((2,), dtype=dtype) cupy.add.at(expected, index, x) self._check(out, expected) @testing.for_dtypes('iI') def test_atomic_sub(self, dtype): @jit.rawkernel() def f(x, out): tid = jit.blockDim.x * jit.blockIdx.x + jit.threadIdx.x if tid < x.size: jit.atomic_sub(out, 0, x[tid]) x = cupy.ones((1024,), dtype=dtype) out = cupy.sum(x, dtype=dtype).reshape(1,) f((32,), (32,), (x, out)) expected = cupy.zeros_like(out) self._check(out, expected) @testing.for_dtypes('iILQf') def test_atomic_exch(self, dtype): @jit.rawkernel() def f(x, out): tid = jit.blockDim.x * jit.blockIdx.x + jit.threadIdx.x if tid < x.size: jit.atomic_exch(out, x.size - tid - 1, x[tid]) x = cupy.arange(1024, dtype=dtype) out = cupy.zeros_like(x) f((32,), (32,), (x, out)) expected = x[::-1] self._check(out, expected) @testing.for_dtypes('iILQ') def test_atomic_min(self, dtype): @jit.rawkernel() def f(x, out): tid = jit.blockDim.x * jit.blockIdx.x + jit.threadIdx.x if tid < x.size: jit.atomic_min(out, tid, x[tid]) x = cupy.arange(1024, dtype=dtype) out = x + 1 f((32,), (32,), (x, out)) # effectively copy x to out self._check(out, x) @testing.for_dtypes('iILQ') def test_atomic_max(self, dtype): @jit.rawkernel() def f(x, out): tid = jit.blockDim.x * jit.blockIdx.x + jit.threadIdx.x if tid < x.size: jit.atomic_max(out, tid, x[tid]) x = cupy.arange(1, 1025, dtype=dtype) out = x - 1 f((32,), (32,), (x, out)) # effectively copy x to out self._check(out, x) def test_atomic_inc(self): dtype = cupy.uint32 # atomic_inc only supports 1 dtype @jit.rawkernel() def f(x, out): tid = jit.blockDim.x * jit.blockIdx.x + jit.threadIdx.x if tid < x.size: # = 0 if out[tid] >= x[tid] else out[tid] + 1 jit.atomic_inc(out, tid, x[tid]) x = cupy.arange(1, 1025, dtype=dtype) out = x - 1 f((32,), (32,), (x, out)) expected = x self._check(out, expected) def test_atomic_dec(self): dtype = cupy.uint32 # atomic_dec only supports 1 dtype @jit.rawkernel() def f(x, out): tid = jit.blockDim.x * jit.blockIdx.x + jit.threadIdx.x if tid < x.size: # = x[tid] if (out[tid] == 0 or out[tid] >= x[tid]) \ # else out[tid] - 1 jit.atomic_dec(out, tid, x[tid]) x = cupy.zeros(1024, dtype=dtype) out = cupy.arange(1024, dtype=dtype) f((32,), (32,), (x, out)) expected = x self._check(out, expected) @testing.for_dtypes('iILQ' if runtime.is_hip else 'iHILQ') def test_atomic_cas(self, dtype): if dtype == cupy.uint16: if ( runtime.is_hip or runtime.runtimeGetVersion() < 10010 or int(device.get_compute_capability()) < 70 ): self.skipTest('not supported') @jit.rawkernel() def f(x, y, out): tid = jit.blockDim.x * jit.blockIdx.x + jit.threadIdx.x if tid < x.size: # = y[tid] if out[tid] == x[tid] else out[tid] jit.atomic_cas(out, tid, x[tid], y[tid]) x = cupy.arange(1024, dtype=dtype) y = x.copy() y[512:] = 0 out = x.copy() out[:512] = 0 f((32,), (32,), (x, y, out)) expected = cupy.zeros_like(out) self._check(out, expected) @testing.for_dtypes('iILQ') def test_atomic_and(self, dtype): @jit.rawkernel() def f(x, out): tid = jit.blockDim.x * jit.blockIdx.x + jit.threadIdx.x if tid < x.size: jit.atomic_and(out, tid, x[tid]) x = cupy.arange(1024, dtype=dtype) out = cupy.ones_like(x) f((32,), (32,), (x, out)) expected = cupy.zeros_like(out) expected[1::2] = 1 self._check(out, expected) @testing.for_dtypes('iILQ') def test_atomic_or(self, dtype): @jit.rawkernel() def f(x, out): tid = jit.blockDim.x * jit.blockIdx.x + jit.threadIdx.x if tid < x.size: jit.atomic_or(out, tid, x[tid]) x = testing.shaped_random((1024,), dtype=dtype, seed=0) out = testing.shaped_random((1024,), dtype=dtype, seed=1) y = out.copy() f((32,), (32,), (x, out)) expected = x | y self._check(out, expected) @testing.for_dtypes('iILQ') def test_atomic_xor(self, dtype): @jit.rawkernel() def f(x, out): tid = jit.blockDim.x * jit.blockIdx.x + jit.threadIdx.x if tid < x.size: jit.atomic_xor(out, tid, x[tid]) @jit.rawkernel() def f_no_atomic(x, out): tid = jit.blockDim.x * jit.blockIdx.x + jit.threadIdx.x if tid < x.size: out[tid] = out[tid] ^ x[tid] x = testing.shaped_random((1024,), dtype=dtype, seed=0) out = testing.shaped_random((1024,), dtype=dtype, seed=1) expected = out.copy() f((32,), (32,), (x, out)) f_no_atomic((32,), (32,), (x, expected)) self._check(out, expected) def test_raw_grid_block_interface(self): @jit.rawkernel() def f(x, y, size): tid = jit.threadIdx.x + jit.blockDim.x * jit.blockIdx.x ntid = jit.blockDim.x * jit.gridDim.x for i in range(tid, size, ntid): y[i] = x[i] x = testing.shaped_random((1024,), dtype=numpy.int32, seed=0) y = testing.shaped_random((1024,), dtype=numpy.int32, seed=1) f[5, 6](x, y, numpy.uint32(1024)) assert bool((x == y).all()) # TODO(leofang): test float16 ('e') once cupy/cupy#5346 is resolved @testing.for_dtypes('iIlqfd' if runtime.is_hip else 'iIlLqQfd') def test_shfl(self, dtype): # strictly speaking this function is invalid in Python (see the # discussion in cupy/cupy#5340), but it serves for our purpose @jit.rawkernel() def f(a, b): laneId = jit.threadIdx.x & 0x1f if laneId == 0: value = a value = jit.shfl_sync(0xffffffff, value, 0) b[laneId] = value a = dtype(100) b = cupy.empty((32,), dtype=dtype) f[1, 32](a, b) assert (b == a * cupy.ones((32,), dtype=dtype)).all() def test_shfl_width(self): @jit.rawkernel() def f(a, b, w): laneId = jit.threadIdx.x & 0x1f value = jit.shfl_sync(0xffffffff, b[jit.threadIdx.x], 0, width=w) b[laneId] = value c = cupy.arange(32, dtype=cupy.int32) for w in (2, 4, 8, 16, 32): a = cupy.int32(100) b = cupy.arange(32, dtype=cupy.int32) f[1, 32](a, b, w) c[c % w != 0] = c[c % w == 0] assert (b == c).all() # TODO(leofang): test float16 ('e') once cupy/cupy#5346 is resolved @testing.for_dtypes('iIlqfd' if runtime.is_hip else 'iIlLqQfd') def test_shfl_up(self, dtype): N = 5 @jit.rawkernel() def f(a): value = jit.shfl_up_sync(0xffffffff, a[jit.threadIdx.x], N) a[jit.threadIdx.x] = value a = cupy.arange(32, dtype=dtype) f[1, 32](a) expected = [i for i in range(N)] + [i for i in range(32-N)] assert (a == cupy.asarray(expected, dtype=dtype)).all() # TODO(leofang): test float16 ('e') once cupy/cupy#5346 is resolved @testing.for_dtypes('iIlqfd' if runtime.is_hip else 'iIlLqQfd') def test_shfl_down(self, dtype): N = 5 # __shfl_down() on HIP does not seem to have the same behavior... block = cupy._core._get_warpsize() @jit.rawkernel() def f(a): value = jit.shfl_down_sync(0xffffffff, a[jit.threadIdx.x], N) a[jit.threadIdx.x] = value a = cupy.arange(block, dtype=dtype) f[1, block](a) expected = [i for i in range(N, block)] expected += [(block-N+i) for i in range(N)] assert (a == cupy.asarray(expected, dtype=dtype)).all() # TODO(leofang): test float16 ('e') once cupy/cupy#5346 is resolved @testing.for_dtypes('iIlqfd' if runtime.is_hip else 'iIlLqQfd') def test_shfl_xor(self, dtype): @jit.rawkernel() def f_shfl_xor(a): laneId = jit.threadIdx.x & 0x1f value = 31 - laneId i = 16 while i >= 1: value += jit.shfl_xor_sync(0xffffffff, value, i) i //= 2 a[jit.threadIdx.x] = value a = cupy.arange(32, dtype=dtype) b = a.copy() f_shfl_xor[1, 32](a) assert (a == b.sum() * cupy.ones(32, dtype=dtype)).all() def test_error_msg(self): @jit.rawkernel() def f(x): return unknown_var # NOQA x = cupy.zeros((10,), dtype=numpy.float32) with pytest.raises(NameError, match='Unbound name: unknown_var'): f((1,), (1,), (x,)) def test_laneid(self): @jit.rawkernel() def f(arr): x = jit.grid(1) if x < arr.size: arr[x] = jit.laneid() N = cupy._core._get_warpsize() x = cupy.zeros((N*2,), dtype=cupy.uint32) f((1,), (N*2,), (x,)) y = cupy.arange(N*2, dtype=cupy.uint32) % N assert (x == y).all() def test_warpsize(self): @jit.rawkernel() def f(arr): x = jit.grid(1) if x == 0: arr[0] = jit.warpsize N = cupy._core._get_warpsize() x = cupy.zeros((1,), dtype=cupy.uint32) f((1,), (1,), (x,)) assert x == N @testing.for_CF_orders() def test_shape_strides(self, order): @jit.rawkernel() def f(arr): x = jit.grid(1) if x == 0: shapes = arr.shape strides = arr.strides arr[0, 0, 0] = shapes[0] arr[0, 0, 1] = shapes[1] arr[0, 0, 2] = shapes[2] arr[0, 0, 3] = strides[0] arr[0, 0, 4] = strides[1] arr[0, 0, 5] = strides[2] a = cupy.zeros((3, 4, 6), order=order) f[1, 1](a) assert (a[0, 0, 0:3] == cupy.asarray(a.shape)).all() assert (a[0, 0, 3:6] == cupy.asarray(a.strides)).all() @testing.multi_gpu(2) def test_device_cache(self): @jit.rawkernel() def f(x, y): tid = jit.threadIdx.x + jit.blockDim.x * jit.blockIdx.x y[tid] = x[tid] with device.Device(0): x = testing.shaped_random((30,), dtype=numpy.int32, seed=0) y = testing.shaped_random((30,), dtype=numpy.int32, seed=1) f((5,), (6,), (x, y)) assert bool((x == y).all()) with device.Device(1): x = testing.shaped_random((30,), dtype=numpy.int32, seed=2) y = testing.shaped_random((30,), dtype=numpy.int32, seed=3) f((5,), (6,), (x, y)) assert bool((x == y).all()) def test_cached_code(self): @jit.rawkernel() def f(x, y): tid = jit.threadIdx.x + jit.blockDim.x * jit.blockIdx.x y[tid] = x[tid] assert isinstance(f.cached_codes, dict) assert len(f.cached_codes) == 0 x = testing.shaped_random((30,), dtype=numpy.int32, seed=0) y = testing.shaped_random((30,), dtype=numpy.int32, seed=1) f((5,), (6,), (x, y)) assert bool((x == y).all()) assert len(f.cached_codes) == 1 assert isinstance(f.cached_code, str) x = testing.shaped_random((30,), dtype=numpy.float32, seed=0) y = testing.shaped_random((30,), dtype=numpy.float32, seed=1) f((5,), (6,), (x, y)) assert bool((x == y).all()) assert len(f.cached_codes) == 2 x = testing.shaped_random((30,), dtype=numpy.int32, seed=0) y = testing.shaped_random((30,), dtype=numpy.int32, seed=1) f((5,), (6,), (x, y)) assert bool((x == y).all()) assert len(f.cached_codes) == 2 @pytest.mark.parametrize( 'unroll', (True, False, None, 5) ) def test_range(self, unroll): @jit.rawkernel() def f(x): tid = jit.threadIdx.x + jit.blockDim.x * jit.blockIdx.x y = x[tid] # workaround: can't use "is" here... if unroll == None: # noqa: E711 for i in jit.range(5): y += 2 else: for i in jit.range(5, unroll=unroll): y += 2 x[tid] = y x = testing.shaped_random((30,), dtype=numpy.float32, seed=0) y = x.copy() f[3, 10](x) assert (x == y + 10).all() if unroll is True: assert '#pragma unroll\n' in f.cached_code elif unroll is False: assert '#pragma unroll(1)\n' in f.cached_code elif unroll is None: assert 'unroll' not in f.cached_code elif unroll >= 0: assert f'#pragma unroll({unroll})\n' in f.cached_code @pytest.mark.parametrize('ctype', (bool, int, float, complex)) def test_scalar_args(self, ctype): @jit.rawkernel() def f(x, y): tid = jit.threadIdx.x + jit.blockDim.x * jit.blockIdx.x x[tid] = y x = cupy.zeros((30), dtype=ctype) y = ctype(1) f((5,), (6,), (x, y)) testing.assert_array_equal(x, numpy.full_like(x, 1)) @testing.for_all_dtypes() def test_numpy_scalar_args(self, dtype): @jit.rawkernel() def f(x, y): tid = jit.threadIdx.x + jit.blockDim.x * jit.blockIdx.x x[tid] = y x = cupy.zeros((30), dtype=dtype) y = numpy.dtype(dtype).type(1) f((5,), (6,), (x, y)) testing.assert_array_equal(x, numpy.full_like(x, 1))