Commit deb763b7 authored by root's avatar root
Browse files

clone code from github

parent 93bf084b
Pipeline #3386 canceled with stages
# distutils: language = c++
import warnings
import string
import numpy
import cupy
import cupy._core.core as core
from cupy._core._kernel import ElementwiseKernel, _get_warpsize
from cupy._core._ufuncs import elementwise_copy
from libcpp cimport vector
from cupy._core._carray cimport shape_t
from cupy._core._carray cimport strides_t
from cupy._core cimport core
from cupy._core cimport _routines_math as _math
from cupy._core cimport _routines_manipulation as _manipulation
from cupy._core.core cimport _ndarray_base
from cupy._core cimport internal
# _ndarray_base members
cdef _ndarray_base _ndarray_getitem(_ndarray_base self, slices):
    # Implements ``__getitem__``: basic indices (int/slice/None/Ellipsis)
    # are resolved into a view first; any remaining advanced (array/mask)
    # indices are then dispatched to the appropriate helper.
    cdef Py_ssize_t axis
    cdef list slice_list
    cdef _ndarray_base a
    slice_list = _prepare_slice_list(slices)
    a, adv = _view_getitem(self, slice_list)
    if adv is None:
        # Basic indexing only: the view itself is the result.
        return a
    axis = adv  # first axis consumed by the remaining array indices
    if len(slice_list) == 1:
        s = slice_list[0]
        if s.dtype.kind == 'b':
            # Single boolean mask index.
            return _getitem_mask_single(a, s, axis)
        else:
            # Single integer array index: equivalent to take() along axis.
            return a.take(s, axis)
    # Multiple array indices.
    return _getitem_multiple(a, axis, slice_list)
cdef _ndarray_setitem(_ndarray_base self, slices, value):
    # Implements ``__setitem__`` as a scatter 'update' operation.
    # ndarray values have their leading length-1 dimensions dropped first
    # so broadcasting against the indexed region behaves as expected.
    src = value
    if isinstance(src, _ndarray_base):
        src = _squeeze_leading_unit_dims(src)
    _scatter_op(self, slices, src, 'update')
cdef tuple _ndarray_nonzero(_ndarray_base self):
    # Implements ``nonzero``: returns one index array per dimension.
    cdef int ndim
    # dst has shape (count_nonzero, ndim); see _ndarray_argwhere.
    cdef _ndarray_base dst = _ndarray_argwhere(self)
    ndim = self.ndim
    if ndim >= 1:
        # Split the argwhere result column-wise into per-axis index arrays.
        return tuple([dst[:, i] for i in range(ndim)])
    else:
        warnings.warn(
            'calling nonzero on 0d arrays is deprecated',
            DeprecationWarning)
        # 0-d case: NumPy-compatible 1-tuple of zeros, one per nonzero.
        return cupy.zeros(dst.shape[0], numpy.int64),
# TODO(kataoka): Rename the function because `_ndarray_base` does not have
# `argwhere` method
cpdef _ndarray_base _ndarray_argwhere(_ndarray_base self):
    # Return a (count_nonzero, ndim) int64 array of the indices of nonzero
    # elements, like numpy.argwhere.  NOTE: synchronizes with the device to
    # read back the nonzero count.
    cdef Py_ssize_t count_nonzero
    cdef int ndim
    cdef _ndarray_base nonzero
    numpy_int64 = numpy.int64
    if self.size == 0:
        count_nonzero = 0
    else:
        if self.dtype == numpy.bool_:
            # Already a boolean mask; skip the comparison kernel.
            nonzero = self.ravel()
        else:
            nonzero = cupy._core.not_equal(self, 0)
            nonzero = nonzero.ravel()
        # Get number of True in the mask to determine the shape of the array
        # after masking.
        if nonzero.size <= 2 ** 31 - 1:
            # int32 scan suffices while indices fit into 32 bits.
            scan_dtype = numpy.int32
        else:
            scan_dtype = numpy_int64
        chunk_size = 512
        # TODO(anaruse): Use Optuna to automatically tune the threshold
        # that determines whether "incomplete scan" is enabled or not.
        # Basically, "incomplete scan" is fast when the array size is large,
        # but for small arrays, it is better to use the normal method.
        incomplete_scan = nonzero.size > chunk_size
        scan_index = _math.scan(
            nonzero, op=_math.scan_op.SCAN_SUM, dtype=scan_dtype, out=None,
            incomplete=incomplete_scan, chunk_size=chunk_size)
        count_nonzero = int(scan_index[-1])  # synchronize!
    ndim = self._shape.size()
    dst = core.ndarray((count_nonzero, ndim), dtype=numpy_int64)
    if dst.size == 0:
        # No nonzero elements (or 0-d input): nothing to fill in.
        return dst
    nonzero.shape = self.shape
    if incomplete_scan:
        # The incomplete (chunked) scan needs a kernel that finishes the
        # prefix sums inside each block before writing indices.
        warp_size = _get_warpsize()
        size = scan_index.size * chunk_size
        _nonzero_kernel_incomplete_scan(chunk_size, warp_size)(
            nonzero, scan_index, dst,
            size=size, block_size=chunk_size)
    else:
        scan_index.shape = self.shape
        _nonzero_kernel(nonzero, scan_index, dst)
    return dst
cdef _ndarray_base _ndarray_take(_ndarray_base self, indices, axis, out):
    # Implements ``take``: flatten the whole array when axis is None,
    # otherwise take along the single normalized axis.
    cdef Py_ssize_t nd = self._shape.size()
    if axis is None:
        # Treat the array as flattened (all axes from 0 to nd).
        return _take(self, indices, 0, nd, out)
    if nd == 0:
        # check axis after atleast_1d
        internal._normalize_axis_index(axis, 1)
        return _take(self, indices, 0, 0, out)
    axis = internal._normalize_axis_index(axis, nd)
    return _take(self, indices, axis, axis + 1, out)
cdef _ndarray_base _ndarray_put(_ndarray_base self, indices, values, mode):
    # Implements ``put``: write ``values`` at flattened ``indices`` of self,
    # with out-of-bounds behavior selected by ``mode``.
    if mode not in ('raise', 'wrap', 'clip'):
        raise ValueError('clipmode not understood')
    n = self.size
    if not isinstance(indices, _ndarray_base):
        indices = core.array(indices)
    indices = indices.ravel()
    if not isinstance(values, _ndarray_base):
        values = core.array(values, dtype=self.dtype)
    if values.size == 0:
        # values are cycled (i % n_vals in the kernels); empty values would
        # divide by zero, and is a no-op anyway.
        return
    if mode == 'raise':
        # Device-side error flag; checked on the host afterwards.
        err = cupy.zeros((), dtype=numpy.bool_)
        _put_raise_kernel(indices, values, values.size, n, self, err)
        if err:  # synchronizes to inspect the flag
            raise IndexError('invalid entry in indices array')
    elif mode == 'wrap':
        _put_wrap_kernel(indices, values, values.size, n, self)
    elif mode == 'clip':
        _put_clip_kernel(indices, values, values.size, n, self)
cdef _ndarray_base _ndarray_choose(_ndarray_base self, choices, out, mode):
    # Implements ``choose``: use self as an index array that selects, per
    # element, from the first axis of ``choices``.
    a = self
    n = choices.shape[0]
    # broadcast `a` and `choices[i]` for all i
    if a.ndim < choices.ndim - 1:
        for i in range(choices.ndim - 1 - a.ndim):
            a = a[None, ...]
    elif a.ndim > choices.ndim - 1:
        for i in range(a.ndim + 1 - choices.ndim):
            choices = choices[:, None, ...]
    ba, bcs = _manipulation.broadcast(a, choices).values
    if out is None:
        out = core.ndarray(ba.shape[1:], choices.dtype)
    # elements per choice "channel" in the broadcast layout
    n_channel = numpy.prod(bcs[0].shape)
    if mode == 'raise':
        # validate against the pre-broadcast index array (same elements)
        if not ((a < n).all() and (0 <= a).all()):
            raise ValueError('invalid entry in choice array')
        _choose_kernel(ba[0], bcs, n_channel, out)
    elif mode == 'wrap':
        ba = ba[0] % n
        _choose_kernel(ba, bcs, n_channel, out)
    elif mode == 'clip':
        _choose_clip_kernel(ba[0], bcs, n_channel, n, out)
    else:
        raise ValueError('clipmode not understood')
    return out
cdef _ndarray_base _ndarray_compress(_ndarray_base self, condition, axis, out):
    # Implements ``compress``: keep the slices along ``axis`` where the
    # 1-d ``condition`` is true, by converting the condition to indices.
    arr = self
    if numpy.isscalar(condition):
        raise ValueError('condition must be a 1-d array')
    if not isinstance(condition, _ndarray_base):
        condition = core.array(condition, dtype=int)
    if condition.ndim != 1:
        raise ValueError('condition must be a 1-d array')
    # do not test condition.shape
    true_idx = _ndarray_nonzero(condition)  # synchronize
    # the `take` method/function also make the input atleast_1d
    return _ndarray_take(arr, true_idx[0], axis, out)
cdef _ndarray_base _ndarray_diagonal(_ndarray_base self, offset, axis1, axis2):
    # Thin wrapper: implements ``diagonal`` via the module-level helper.
    return _diagonal(self, offset, axis1, axis2)
# private/internal
cdef _ndarray_base _squeeze_leading_unit_dims(_ndarray_base src):
    # remove leading 1s from the shape greedily.
    # TODO(kataoka): compute requested ndim and do not remove too much for
    # printing correct shape in error message.
    cdef Py_ssize_t i
    for i in range(src.ndim):
        if src._shape[i] != 1:
            break
    else:
        # Loop ran to completion: every dimension is 1 (or ndim == 0).
        i = src.ndim
    if i == 0:
        # Nothing to strip; return the original (no view created).
        return src
    src = src.view()
    # Erase the leading extents directly from the C++ shape/stride vectors;
    # equivalent to the Python-level slicing shown below.
    # del src._shape[:i]
    # del src._strides[:i]
    src._shape.erase(src._shape.begin(), src._shape.begin()+i)
    src._strides.erase(src._strides.begin(), src._strides.begin()+i)
    return src
cpdef list _prepare_slice_list(slices):
    # Normalize the argument of __getitem__/__setitem__ into a flat list in
    # which every array-like index has been converted to a cupy.ndarray.
    cdef Py_ssize_t i
    cdef list slice_list
    cdef bint fix_empty_dtype
    if isinstance(slices, tuple):
        slice_list = list(slices)
    else:
        slice_list = [slices]
    # Convert list/NumPy/CUDA-Array-Interface arrays to cupy.ndarray.
    # - Scalar int in indices returns a view.
    # - Other array-like (including ()-shaped array) in indices forces to
    #   return a new array.
    for i, s in enumerate(slice_list):
        if s is None or s is Ellipsis or isinstance(s, (slice, _ndarray_base)):
            # Already in canonical form; leave untouched.
            continue
        fix_empty_dtype = False
        if isinstance(s, (list, tuple)):
            # This condition looks inaccurate, but so is NumPy.
            # a[1, [np.empty(0, float)]] is allowed, while
            # a[1, np.empty((1, 0), float)] raises IndexError.
            fix_empty_dtype = True
        elif numpy.isscalar(s):
            if not isinstance(s, (bool, numpy.bool_)):
                # keep scalar int
                continue
        if cupy.min_scalar_type(s).char == 'O':
            # Object dtype means non-numeric entries.
            raise IndexError(
                'arrays used as indices must be of integer (or boolean) type')
        try:
            s = core.array(s, dtype=None, copy=False)
        except ValueError:
            # "Unsupported dtype"
            raise IndexError(
                'only integers, slices (`:`), ellipsis (`...`),'
                'numpy.newaxis (`None`) and integer or '
                'boolean arrays are valid indices')
        if fix_empty_dtype and s.size == 0:
            # An empty list means empty indices, not empty mask.
            # Fix default dtype (float64).
            s = s.astype(numpy.int32)
        slice_list[i] = s
    return slice_list
cdef tuple _view_getitem(_ndarray_base a, list slice_list):
    # Process scalar/slice/ellipsis indices
    # Returns a 2-tuple
    # - [0] (ndarray): view of a
    # - [1] (int or None): start axis for remaining indices
    # slice_list will be overwritten.
    # input should contain:
    #     None, Ellipsis, slice (start:stop:step), scalar int, or
    #     cupy.ndarray
    # output will contain:
    #     cupy.ndarray
    cdef shape_t shape
    cdef strides_t strides
    cdef _ndarray_base v
    cdef Py_ssize_t ndim_a, axis_a, ndim_v, axis_v, ndim_ellipsis
    cdef Py_ssize_t i, k, offset
    cdef Py_ssize_t s_start, s_stop, s_step, dim, ind
    cdef slice ss
    cdef list index_list, axes
    cdef vector.vector[bint] array_like_flags
    cdef vector.vector[Py_ssize_t] array_ndims
    cdef bint has_ellipsis, flag
    cdef char kind

    # First pass: validate indices and count how many axes of `a` each one
    # consumes (axis_a), so the ellipsis expansion width can be computed.
    axis_a = 0
    has_ellipsis = False
    for s in slice_list:
        if s is None:
            continue
        elif s is Ellipsis:
            if has_ellipsis:
                raise IndexError(
                    "an index can only have a single ellipsis ('...')")
            has_ellipsis = True
        elif isinstance(s, _ndarray_base):
            kind = ord(s.dtype.kind)
            if kind == b'b':
                # A boolean mask consumes as many axes as its ndim.
                k = s.ndim
            elif kind == b'i' or kind == b'u':
                # An integer array consumes exactly one axis.
                k = 1
            else:
                raise IndexError(
                    'arrays used as indices must be of integer or boolean '
                    'type. (actual: {})'.format(s.dtype.type))
            array_ndims.push_back(k)
            axis_a += k
        else:
            # isinstance(s, slice) or numpy.isscalar(s)
            axis_a += 1
    if not has_ellipsis:
        # Implicit trailing ellipsis keeps untouched axes.
        slice_list.append(Ellipsis)
    ndim_a = a._shape.size()
    if axis_a > ndim_a:
        raise IndexError(
            'too many indices for array: '
            f'array is {ndim_a}-dimensional, but {axis_a} were indexed')
    ndim_ellipsis = ndim_a - axis_a

    # Create new shape and stride
    i = 0
    axis_a = 0
    axis_v = 0
    offset = 0
    # index_list: remaining indices to be processed.
    # Each elem is a 3-tuple (array, axis_start, axis_count)
    index_list = []
    for s in slice_list:
        if s is None:
            # newaxis: insert a length-1 axis with stride 0.
            shape.push_back(1)
            strides.push_back(0)
            axis_v += 1
            array_like_flags.push_back(False)
        elif isinstance(s, _ndarray_base):
            # Keep the consumed axes in the view; the actual gather is
            # performed later by the caller.
            k = array_ndims[i]
            index_list.append((s, axis_v, k))
            i += 1
            kind = ord(s.dtype.kind)
            if kind == b'b':
                _check_mask_shape(a, s, axis_a)
            for _ in range(k):
                shape.push_back(a._shape[axis_a])
                strides.push_back(a._strides[axis_a])
                axis_a += 1
            axis_v += k
            array_like_flags.push_back(True)
        elif s is Ellipsis:
            # Pass the unindexed axes through unchanged.
            for _ in range(ndim_ellipsis):
                shape.push_back(a._shape[axis_a])
                strides.push_back(a._strides[axis_a])
                axis_a += 1
            axis_v += ndim_ellipsis
            array_like_flags.push_back(False)
        elif isinstance(s, slice):
            ss = internal.complete_slice(s, a._shape[axis_a])
            s_start = ss.start
            s_stop = ss.stop
            s_step = ss.step
            if s_step > 0:
                dim = (s_stop - s_start - 1) // s_step + 1
            else:
                dim = (s_stop - s_start + 1) // s_step + 1
            if dim == 0:
                # Empty axis: keep the original stride (value is arbitrary).
                strides.push_back(a._strides[axis_a])
            else:
                strides.push_back(a._strides[axis_a] * s_step)
            if s_start > 0:
                offset += a._strides[axis_a] * s_start
            shape.push_back(dim)
            axis_a += 1
            axis_v += 1
            array_like_flags.push_back(False)
        else:
            # numpy.isscalar(s)
            ind = int(s)
            if ind < 0:
                ind += a._shape[axis_a]
            if not (0 <= ind < a._shape[axis_a]):
                msg = ('Index %s is out of bounds for axis %s with '
                       'size %s' % (s, axis_a, a._shape[axis_a]))
                raise IndexError(msg)
            # Scalar drops the axis: fold it into the byte offset.
            offset += ind * a._strides[axis_a]
            axis_a += 1
            # array-like but not array
            array_like_flags.push_back(True)
    ndim_v = axis_v
    v = a.view()
    if a.size != 0:
        v.data = a.data + offset
    v._set_shape_and_strides(shape, strides, True, True)
    if array_ndims.empty():
        # no advanced indexing. no mask.
        del slice_list[:]
        return v, None
    # Leave only the array indices for the caller.
    slice_list[:] = [s for s, _, _ in index_list]

    # non-consecutive array-like indices => batch dims go first in output
    # consecutive array-like indices => start batch dims there
    # k tracks a tiny state machine: 0 = before the array-like run,
    # 1 = inside it, 2 = after it; a second run (flag while k == 2) means
    # the array-like indices are non-consecutive.
    k = 0
    for i, flag in enumerate(array_like_flags):
        if k == 0:
            if flag:
                k = 1
        elif k == 1:
            if not flag:
                k = 2
        else:  # k == 2
            if flag:
                break
    else:
        # Consecutive: batch dims start at the first array index's axis.
        return v, index_list[0][1]
    # compute transpose arg
    # Non-consecutive: move all array-consumed axes to the front.
    axes = []
    for _, axis_v, k in index_list:
        for _ in range(k):
            axes.append(axis_v)
            axis_v += 1
    axes.extend([dim for dim in range(ndim_v) if dim not in axes])
    v = _manipulation._transpose(v, axes)
    return v, 0
@cupy._util.memoize(for_each_device=True)
def _nonzero_kernel_incomplete_scan(block_size, warp_size=32):
    """Build the argwhere kernel used with a chunked ("incomplete") scan.

    ``b`` holds per-chunk partial sums; the kernel completes the prefix sum
    within each block using warp shuffles, then writes the multi-dimensional
    index of every nonzero element of ``a`` into row ``x0`` of ``dst``.
    Memoized per device since the CUDA source depends on block/warp size.
    """
    in_params = 'raw T a, raw S b'
    out_params = 'raw O dst'
    loop_prep = string.Template("""
        __shared__ S smem[${warp_size}];
        const int n_warp = ${block_size} / ${warp_size};
        const int warp_id = threadIdx.x / ${warp_size};
        const int lane_id = threadIdx.x % ${warp_size};
    """).substitute(block_size=block_size, warp_size=warp_size)
    loop_body = string.Template("""
        S x = 0;
        if (i < a.size()) x = a[i];
        for (int j = 1; j < ${warp_size}; j *= 2) {
            S tmp = __shfl_up_sync(0xffffffff, x, j, ${warp_size});
            if (lane_id - j >= 0) x += tmp;
        }
        if (lane_id == ${warp_size} - 1) smem[warp_id] = x;
        __syncthreads();
        if (warp_id == 0) {
            S y = 0;
            if (lane_id < n_warp) y = smem[lane_id];
            for (int j = 1; j < n_warp; j *= 2) {
                S tmp = __shfl_up_sync(0xffffffff, y, j, ${warp_size});
                if (lane_id - j >= 0) y += tmp;
            }
            int block_id = i / ${block_size};
            S base = 0;
            if (block_id > 0) base = b[block_id - 1];
            if (lane_id == ${warp_size} - 1) y = 0;
            smem[(lane_id + 1) % ${warp_size}] = y + base;
        }
        __syncthreads();
        x += smem[warp_id];
        S x0 = __shfl_up_sync(0xffffffff, x, 1, ${warp_size});
        if (lane_id == 0) {
            x0 = smem[warp_id];
        }
        if (x0 < x && i < a.size()) {
            O j = i;
            for (int d = a.ndim - 1; d >= 0; d--) {
                ptrdiff_t ind[] = {x0, d};
                O j_next = j / a.shape()[d];
                dst[ind] = j - j_next * a.shape()[d];
                j = j_next;
            }
        }
    """).substitute(block_size=block_size, warp_size=warp_size)
    return cupy.ElementwiseKernel(in_params, out_params, loop_body,
                                  'cupy_nonzero_kernel_incomplete_scan',
                                  loop_prep=loop_prep)
# Writes the multi-dimensional index of each nonzero element into ``dst``;
# ``index`` is the inclusive prefix sum of the nonzero mask, so ``index - 1``
# is the destination row.
_nonzero_kernel = ElementwiseKernel(
    'T src, S index', 'raw U dst',
    '''
    if (src != 0){
        for(int j = 0; j < _ind.ndim; j++){
            ptrdiff_t ind[] = {index - 1, j};
            dst[ind] = _ind.get()[j];
        }
    }''',
    'cupy_nonzero_kernel',
    reduce_dims=False)

# Shared gather logic for take: maps the output element index ``i`` and the
# (wrapped) index value onto the flattened source offset, using the
# left/center/right dimension products computed by ``_take``.
_take_kernel_core = '''
ptrdiff_t out_i = indices % index_range;
if (out_i < 0) out_i += index_range;
if (ldim != 1) out_i += (i / (cdim * rdim)) * index_range;
if (rdim != 1) out_i = out_i * rdim + i % rdim;
out = a[out_i];
'''

# take with an array of indices.
_take_kernel = ElementwiseKernel(
    'raw T a, S indices, uint32 ldim, uint32 cdim, uint32 rdim, '
    'int64 index_range',
    'T out', _take_kernel_core, 'cupy_take')

# take with a single scalar index (avoids materializing an index array).
_take_kernel_scalar = ElementwiseKernel(
    'raw T a, int64 indices, uint32 ldim, uint32 cdim, uint32 rdim, '
    'int64 index_range',
    'T out', _take_kernel_core, 'cupy_take_scalar')
# choose: select choices[a] per element; ``n_channel`` is the number of
# elements in one broadcast choice array.
_choose_kernel = ElementwiseKernel(
    'S a, raw T choices, int32 n_channel',
    'T y',
    'y = choices[i + n_channel * a]',
    'cupy_choose')

# choose with mode='clip': out-of-range indices are clamped to [0, n - 1].
_choose_clip_kernel = ElementwiseKernel(
    'S a, raw T choices, int32 n_channel, int32 n',
    'T y',
    '''
    S x = a;
    if (a < 0) {
        x = 0;
    } else if (a >= n) {
        x = n - 1;
    }
    y = choices[i + n_channel * x];
    ''',
    'cupy_choose_clip')
# put with mode='raise': sets the device-side ``err`` flag for any index
# outside [-n, n); values are cycled via ``i % n_vals``.
cdef _put_raise_kernel = ElementwiseKernel(
    'S ind, raw T vals, int64 n_vals, int64 n',
    'raw U data, raw bool err',
    '''
    ptrdiff_t ind_ = ind;
    if (!(-n <= ind_ && ind_ < n)) {
        err[0] = 1;
    } else {
        if (ind_ < 0) ind_ += n;
        data[ind_] = (U)(vals[i % n_vals]);
    }
    ''',
    'cupy_put_raise')

# put with mode='wrap': indices are taken modulo n.
cdef _put_wrap_kernel = ElementwiseKernel(
    'S ind, raw T vals, int64 n_vals, int64 n',
    'raw U data',
    '''
    ptrdiff_t ind_ = ind;
    ind_ %= n;
    if (ind_ < 0) ind_ += n;
    data[ind_] = (U)(vals[i % n_vals]);
    ''',
    'cupy_put_wrap')

# put with mode='clip': indices are clamped to [0, n - 1].
cdef _put_clip_kernel = ElementwiseKernel(
    'S ind, raw T vals, int64 n_vals, int64 n',
    'raw U data',
    '''
    ptrdiff_t ind_ = ind;
    if (ind_ < 0) {
        ind_ = 0;
    } else if (ind_ >= n) {
        ind_ = n - 1;
    }
    data[ind_] = (U)(vals[i % n_vals]);
    ''',
    'cupy_put_clip')
cdef _create_scatter_kernel(name, code):
    """Build a scatter kernel parameterized by the update expression.

    ``code`` operates on ``out0`` (the target element of ``a``) and ``in1``
    (the incoming value ``v``); ``in0`` aliases ``out0`` for read-modify-write
    ops.  Indices are wrapped into [0, adim) like NumPy's wrap mode.
    """
    return ElementwiseKernel(
        'T v, S indices, int32 cdim, int32 rdim, int32 adim',
        'raw T a',
        string.Template('''
        S wrap_indices = indices % adim;
        if (wrap_indices < 0) wrap_indices += adim;
        ptrdiff_t li = i / (rdim * cdim);
        ptrdiff_t ri = i % rdim;
        T &out0 = a[(li * adim + wrap_indices) * rdim + ri];
        T &in0 = out0;
        const T &in1 = v;
        ${code};
        ''').substitute(code=code),
        name,
    )
# Index-based scatter kernels.  All ops except plain 'update' use atomics
# because multiple indices may target the same element.
cdef _scatter_update_kernel = _create_scatter_kernel(
    'cupy_scatter_update', 'out0 = in1')
cdef _scatter_add_kernel = _create_scatter_kernel(
    'cupy_scatter_add', 'atomicAdd(&out0, in1)')
cdef _scatter_sub_kernel = _create_scatter_kernel(
    'cupy_scatter_sub', 'atomicSub(&out0, in1)')
cdef _scatter_max_kernel = _create_scatter_kernel(
    'cupy_scatter_max', 'atomicMax(&out0, in1)')
cdef _scatter_min_kernel = _create_scatter_kernel(
    'cupy_scatter_min', 'atomicMin(&out0, in1)')
cdef _scatter_and_kernel = _create_scatter_kernel(
    'cupy_scatter_and', 'atomicAnd(&out0, in1)')
cdef _scatter_or_kernel = _create_scatter_kernel(
    'cupy_scatter_or', 'atomicOr(&out0, in1)')
cdef _scatter_xor_kernel = _create_scatter_kernel(
    'cupy_scatter_xor', 'atomicXor(&out0, in1)')
cdef _create_scatter_mask_kernel(name, code):
    """Build a boolean-mask scatter kernel parameterized by ``code``.

    ``mask_scanned`` is the inclusive prefix sum of the mask, so
    ``mask_scanned - 1`` indexes the compacted source ``v``.  No atomics are
    needed: each True mask position targets a distinct element of ``a``.
    """
    return ElementwiseKernel(
        'raw T v, bool mask, S mask_scanned',
        'T a',
        string.Template('''
        T &out0 = a;
        T &in0 = a;
        const T &in1 = v[mask_scanned - 1];
        if (mask) ${code};
        ''').substitute(code=code),
        name,
    )
# Mask-based scatter kernels, one per supported in-place op.
cdef _scatter_update_mask_kernel = _create_scatter_mask_kernel(
    'cupy_scatter_update_mask', 'out0 = in1')
cdef _scatter_add_mask_kernel = _create_scatter_mask_kernel(
    'cupy_scatter_add_mask', 'out0 = in0 + in1')
# Fix: this kernel was previously registered as 'cupy_scatter_add_mask',
# colliding with the addition kernel's CUDA function name above.
cdef _scatter_sub_mask_kernel = _create_scatter_mask_kernel(
    'cupy_scatter_sub_mask', 'out0 = in0 - in1')
cdef _scatter_max_mask_kernel = _create_scatter_mask_kernel(
    'cupy_scatter_max_mask', 'out0 = max(in0, in1)')
cdef _scatter_min_mask_kernel = _create_scatter_mask_kernel(
    'cupy_scatter_min_mask', 'out0 = min(in0, in1)')
cdef _scatter_and_mask_kernel = _create_scatter_mask_kernel(
    'cupy_scatter_and_mask', 'out0 = (in0 & in1)')
cdef _scatter_or_mask_kernel = _create_scatter_mask_kernel(
    'cupy_scatter_or_mask', 'out0 = (in0 | in1)')
cdef _scatter_xor_mask_kernel = _create_scatter_mask_kernel(
    'cupy_scatter_xor_mask', 'out0 = (in0 ^ in1)')
# Boolean-mask gather: compacts the selected elements of ``a`` into ``out``
# using the inclusive prefix sum of the mask as the destination index.
_getitem_mask_kernel = ElementwiseKernel(
    'T a, bool mask, S mask_scanned',
    'raw T out',
    'if (mask) out[mask_scanned - 1] = a',
    'cupy_getitem_mask')
cdef _check_mask_shape(_ndarray_base a, _ndarray_base mask, Py_ssize_t axis):
    # Validate that a boolean mask starting at ``axis`` matches the
    # corresponding dimensions of ``a`` (a 0-length mask dim is accepted).
    cdef Py_ssize_t i, a_sh, m_sh
    for i, m_sh in enumerate(mask._shape):
        a_sh = a._shape[axis + i]
        if m_sh not in (0, a_sh):
            raise IndexError(
                'boolean index did not match indexed array along dimension '
                f'{axis + i}; dimension is {a_sh} '
                f'but corresponding boolean dimension is {m_sh}'
            )
cpdef _prepare_mask_indexing_single(
        _ndarray_base a, _ndarray_base mask, Py_ssize_t axis):
    # Prepare a boolean mask for indexing ``a`` starting at ``axis``.
    # Returns (mask broadcast to a's shape, its inclusive prefix sum,
    # resulting masked shape).  Synchronizes to read the True count.
    cdef _ndarray_base mask_scanned, mask_br
    cdef int n_true
    cdef tuple lshape, rshape, a_shape
    cdef Py_ssize_t a_ndim, mask_ndim
    a_ndim = a._shape.size()
    mask_ndim = mask._shape.size()
    a_shape = a.shape
    lshape = a_shape[:axis]
    rshape = a_shape[axis + mask._shape.size():]
    if mask.size == 0:
        # Empty mask selects nothing; no scan needed.
        masked_shape = lshape + (0,) + rshape
        mask_br = _manipulation._reshape(mask, masked_shape)
        return mask_br, mask_br, masked_shape
    # Get number of True in the mask to determine the shape of the array
    # after masking.
    if mask.size <= 2 ** 31 - 1:
        mask_type = numpy.int32
    else:
        mask_type = numpy.int64
    op = _math.scan_op.SCAN_SUM
    # starts with 1
    mask_scanned = _math.scan(mask.ravel(), op=op, dtype=mask_type)
    n_true = int(mask_scanned[-1])  # synchronize!
    masked_shape = lshape + (n_true,) + rshape
    # When mask covers the entire array, broadcasting is not necessary.
    if mask_ndim == a_ndim and axis == 0:
        return (
            mask,
            _manipulation._reshape(mask_scanned, mask._shape),
            masked_shape)
    # Discard the un-broadcast scan; it is recomputed below on the
    # broadcast mask.
    mask_scanned = None
    # The scan of the broadcasted array is used to index on kernel.
    mask = _manipulation._reshape(
        mask,
        axis * (1,) + mask.shape + (a_ndim - axis - mask_ndim) * (1,))
    if <Py_ssize_t>mask._shape.size() > a_ndim:
        raise IndexError('too many indices for array')
    mask = _manipulation.broadcast_to(mask, a_shape)
    if mask.size <= 2 ** 31 - 1:
        mask_type = numpy.int32
    else:
        mask_type = numpy.int64
    mask_scanned = _manipulation._reshape(
        _math.scan(mask.ravel(), op=_math.scan_op.SCAN_SUM, dtype=mask_type),
        mask._shape)
    return mask, mask_scanned, masked_shape
cpdef _ndarray_base _getitem_mask_single(
        _ndarray_base a, _ndarray_base mask, int axis):
    # Gather the elements of ``a`` selected by a single boolean mask whose
    # dimensions start at ``axis``.
    cdef _ndarray_base mask_scanned
    cdef tuple masked_shape
    mask, mask_scanned, masked_shape = _prepare_mask_indexing_single(
        a, mask, axis)
    out = core.ndarray(masked_shape, dtype=a.dtype)
    if out.size == 0:
        # Nothing selected; skip the kernel launch.
        return out
    return _getitem_mask_kernel(a, mask, mask_scanned, out)
cdef _ndarray_base _take(
        _ndarray_base a, indices, int start, int stop, _ndarray_base out=None):
    # Take along (flattened) axes from start to stop.
    # When start + 1 == stop this function behaves similarly to np.take
    cdef tuple out_shape, indices_shape
    cdef int i, ndim = a._shape.size()
    cdef Py_ssize_t ldim, cdim, rdim, index_range
    assert start <= stop
    if numpy.isscalar(indices):
        indices_shape = ()
        cdim = 1
    else:
        if not isinstance(indices, _ndarray_base):
            indices = core.array(indices, dtype=int)
        indices_shape = indices.shape
        cdim = indices.size
    # ldim/rdim: products of the axes left/right of the taken span;
    # index_range: flattened extent of the taken span itself.
    ldim = rdim = 1
    if start == 0 and stop == ndim:
        # Whole-array (flattened) take.
        out_shape = indices_shape
        index_range = a.size
    else:
        a_shape = a.shape
        out_shape = a_shape[:start] + indices_shape + a_shape[stop:]
        if len(indices_shape) != 0:
            # Align the index array with the taken span for broadcasting.
            indices = _manipulation._reshape(
                indices,
                (1,) * start + indices_shape + (1,) * (ndim - stop))
        for i in range(start):
            ldim *= a._shape[i]
        for i in range(stop, ndim):
            rdim *= a._shape[i]
        index_range = 1
        for i in range(start, stop):
            index_range *= a._shape[i]
    if out is None:
        out = core.ndarray(out_shape, dtype=a.dtype)
    else:
        if out.dtype != a.dtype:
            raise TypeError('Output dtype mismatch')
        if out.shape != out_shape:
            raise ValueError('Output shape mismatch')
    if a.size == 0 and out.size != 0:
        raise IndexError('cannot do a non-empty take from an empty axes.')
    if isinstance(indices, _ndarray_base):
        return _take_kernel(
            a.reduced_view(), indices, ldim, cdim, rdim, index_range, out)
    else:
        # Scalar index path avoids materializing an index array.
        return _take_kernel_scalar(
            a.reduced_view(), indices, ldim, cdim, rdim, index_range, out)
cdef _scatter_op_single(
        _ndarray_base a, _ndarray_base indices, value, Py_ssize_t start,
        Py_ssize_t stop, op=''):
    # When op == 'update', this function behaves similarly to
    # a code below using NumPy under the condition that a = a._reshape(shape)
    # does not invoke copy.
    #
    # shape = a[:start] +\
    #     (numpy.prod(a[start:stop]),) + a[stop:]
    # a = a._reshape(shape)
    # slices = (slice(None),) * start + indices +\
    #     (slice(None),) * (a.ndim - stop)
    # a[slices] = value
    cdef Py_ssize_t adim, cdim, rdim
    cdef tuple a_shape, indices_shape, lshape, rshape, v_shape
    cdef _ndarray_base v
    if not isinstance(value, _ndarray_base):
        v = core.array(value, dtype=a.dtype)
    else:
        v = value.astype(a.dtype, copy=False)
    a_shape = a.shape
    lshape = a_shape[:start]
    rshape = a_shape[stop:]
    # adim: flattened extent of the scattered span; the kernels wrap
    # indices into [0, adim).
    adim = internal.prod_sequence(a_shape[start:stop])
    indices_shape = indices.shape
    v_shape = lshape + indices_shape + rshape
    v = _manipulation.broadcast_to(v, v_shape)
    cdim = indices.size
    rdim = internal.prod_sequence(rshape)
    # Align the index array with the scattered span for broadcasting.
    indices = _manipulation._reshape(
        indices,
        (1,) * len(lshape) + indices_shape + (1,) * len(rshape))
    indices = _manipulation.broadcast_to(indices, v_shape)
    if op == 'update':
        _scatter_update_kernel(
            v, indices, cdim, rdim, adim, a.reduced_view())
    elif op == 'add':
        # There is constraints on types because atomicAdd() in CUDA 7.5
        # only supports int32, uint32, uint64, and float32.
        if not issubclass(v.dtype.type,
                          (numpy.int32, numpy.float16, numpy.float32,
                           numpy.float64, numpy.uint32, numpy.uint64,
                           numpy.intc, numpy.uintc, numpy.ulonglong)):
            raise TypeError(
                'cupy.add.at only supports int32, float16, float32, float64, '
                'uint32, uint64, as data type')
        _scatter_add_kernel(
            v, indices, cdim, rdim, adim, a.reduced_view())
    elif op == 'sub':
        # atomicSub has a narrower supported type set than atomicAdd.
        if not issubclass(v.dtype.type,
                          (numpy.int32, numpy.uint32,
                           numpy.intc, numpy.uintc)):
            raise TypeError(
                'cupy.subtract.at only supports int32, uint32, as data type')
        _scatter_sub_kernel(
            v, indices, cdim, rdim, adim, a.reduced_view())
    elif op == 'max':
        if not issubclass(v.dtype.type,
                          (numpy.int32, numpy.float32, numpy.float64,
                           numpy.uint32, numpy.uint64,
                           numpy.intc, numpy.uintc, numpy.ulonglong)):
            raise TypeError(
                'cupy.maximum.at only supports int32, float32, float64, '
                'uint32, uint64 as data type')
        _scatter_max_kernel(
            v, indices, cdim, rdim, adim, a.reduced_view())
    elif op == 'min':
        if not issubclass(v.dtype.type,
                          (numpy.int32, numpy.float32, numpy.float64,
                           numpy.uint32, numpy.uint64,
                           numpy.intc, numpy.uintc, numpy.ulonglong)):
            raise TypeError(
                'cupy.minimum.at only supports int32, float32, float64, '
                'uint32, uint64 as data type')
        _scatter_min_kernel(
            v, indices, cdim, rdim, adim, a.reduced_view())
    elif op == 'and':
        if not issubclass(v.dtype.type,
                          (numpy.int32, numpy.int64,
                           numpy.uint32, numpy.uint64,
                           numpy.intc, numpy.uintc,
                           numpy.longlong, numpy.ulonglong)):
            raise TypeError(
                'cupy.bitwise_and.at only supports int32, int64, '
                'uint32, uint64 as data type')
        _scatter_and_kernel(
            v, indices, cdim, rdim, adim, a.reduced_view())
    elif op == 'or':
        if not issubclass(v.dtype.type,
                          (numpy.int32, numpy.int64,
                           numpy.uint32, numpy.uint64,
                           numpy.intc, numpy.uintc,
                           numpy.longlong, numpy.ulonglong)):
            raise TypeError(
                'cupy.bitwise_or.at only supports int32, int64, '
                'uint32, uint64 as data type')
        _scatter_or_kernel(
            v, indices, cdim, rdim, adim, a.reduced_view())
    elif op == 'xor':
        if not issubclass(v.dtype.type,
                          (numpy.int32, numpy.int64,
                           numpy.uint32, numpy.uint64,
                           numpy.intc, numpy.uintc,
                           numpy.longlong, numpy.ulonglong)):
            raise TypeError(
                'cupy.bitwise_xor.at only supports int32, int64, '
                'uint32, uint64 as data type')
        _scatter_xor_kernel(
            v, indices, cdim, rdim, adim, a.reduced_view())
    else:
        raise ValueError('provided op is not supported')
cdef _scatter_op_mask_single(
        _ndarray_base a, _ndarray_base mask, v, Py_ssize_t axis, op):
    # Scatter ``v`` into ``a`` at the positions selected by a boolean mask
    # whose dimensions start at ``axis``; ``op`` picks the in-place update.
    cdef _ndarray_base mask_scanned, src
    cdef tuple masked_shape
    mask, mask_scanned, masked_shape = _prepare_mask_indexing_single(
        a, mask, axis)
    if internal.prod(masked_shape) == 0:
        # Nothing selected: skip kernel launch entirely.
        return
    if isinstance(v, _ndarray_base):
        src = v
    else:
        src = core.array(v, dtype=a.dtype)
    # Cython's static resolution does not work because of omitted arguments
    src = (<object>src).astype(a.dtype, copy=False)
    # broadcast src to shape determined by the mask
    src = _manipulation.broadcast_to(src, masked_shape)
    kernel_table = {
        'update': _scatter_update_mask_kernel,
        'add': _scatter_add_mask_kernel,
        'sub': _scatter_sub_mask_kernel,
        'max': _scatter_max_mask_kernel,
        'min': _scatter_min_mask_kernel,
        'and': _scatter_and_mask_kernel,
        'or': _scatter_or_mask_kernel,
        'xor': _scatter_xor_mask_kernel,
    }
    kernel = kernel_table.get(op)
    if kernel is None:
        raise ValueError('provided op is not supported')
    kernel(src, mask, mask_scanned, a)
cdef _scatter_op(_ndarray_base a, slices, value, op):
    # Top-level scatter dispatch used by __setitem__ and the ufunc .at
    # methods: resolve basic indices into a view, then dispatch on the
    # remaining advanced indices (mask / index array / multiple arrays) or,
    # with no advanced indices, apply ``op`` over the whole view.
    cdef Py_ssize_t start, stop, axis
    cdef _ndarray_base x, y, reduced_idx
    cdef list slice_list
    slice_list = _prepare_slice_list(slices)
    a, adv = _view_getitem(a, slice_list)
    if adv is not None:
        axis = adv
        if len(slice_list) == 1:
            s = slice_list[0]
            if s.dtype.kind == 'b':
                _scatter_op_mask_single(a, s, value, axis, op)
            else:
                _scatter_op_single(a, s, value, axis, axis + 1, op)
        else:
            # scatter_op with multiple integer arrays
            reduced_idx, start, stop = _prepare_multiple_array_indexing(
                a, axis, slice_list)
            _scatter_op_single(a, reduced_idx, value, start, stop, op)
        return
    # Basic indexing only: operate on the view directly.
    y = a
    if op == 'update':
        if not isinstance(value, _ndarray_base):
            y.fill(value)
            return
        x = value
        if (internal.vector_equal(y._shape, x._shape) and
                internal.vector_equal(y._strides, x._strides)):
            if y.data.ptr == x.data.ptr:
                return  # Skip since x and y are the same array
            elif y._c_contiguous and x.dtype == y.dtype:
                # Matching layout: a raw async device copy suffices.
                y.data.copy_from_device_async(x.data, x.nbytes)
                return
        elementwise_copy(x, y)
        return
    if op == 'add':
        _math._add(y, value, y)
        return
    if op == 'sub':
        _math._subtract(y, value, y)
        return
    if op == 'max':
        cupy.maximum(y, value, y)
        return
    if op == 'min':
        cupy.minimum(y, value, y)
        return
    if op == 'and':
        cupy.bitwise_and(y, value, y)
        return
    if op == 'or':
        cupy.bitwise_or(y, value, y)
        return
    if op == 'xor':
        cupy.bitwise_xor(y, value, y)
        return
    raise ValueError('this op is not supported')
cdef _ndarray_base _diagonal(
        _ndarray_base a, Py_ssize_t offset=0, Py_ssize_t axis1=0,
        Py_ssize_t axis2=1):
    # Extract the ``offset``-th diagonal over (axis1, axis2) as a view,
    # by moving the two axes last and combining their strides.
    cdef Py_ssize_t ndim = a.ndim
    if not (-ndim <= axis1 < ndim and -ndim <= axis2 < ndim):
        raise numpy.AxisError(
            'axis1(={0}) and axis2(={1}) must be within range '
            '(ndim={2})'.format(axis1, axis2, ndim))
    axis1 %= ndim
    axis2 %= ndim
    if axis1 < axis2:
        min_axis, max_axis = axis1, axis2
    else:
        min_axis, max_axis = axis2, axis1
    tr = list(range(ndim))
    del tr[max_axis]
    del tr[min_axis]
    if offset >= 0:
        a = _manipulation._transpose(a, tr + [axis1, axis2])
    else:
        # Negative offset: swap the two axes and negate the offset so the
        # same sub-diagonal slicing below applies.
        a = _manipulation._transpose(a, tr + [axis2, axis1])
        offset = -offset
    diag_size = max(0, min(a.shape[-2], a.shape[-1] - offset))
    ret_shape = a.shape[:-2] + (diag_size,)
    if diag_size == 0:
        return core.ndarray(ret_shape, dtype=a.dtype)
    a = a[..., :diag_size, offset:offset + diag_size]
    ret = a.view()
    # TODO(niboshi): Confirm update_x_contiguity flags
    # Summing the last two strides walks the diagonal of the 2-d tail.
    ret._set_shape_and_strides(
        a.shape[:-2] + (diag_size,),
        a.strides[:-2] + (a.strides[-1] + a.strides[-2],),
        True, True)
    return ret
# Accumulates one axis' contribution into the flat index array ``out``:
# wraps each index into [0, len) (Python-style modulo via _floor_divide)
# and adds it scaled by the axis stride.
_prepare_array_indexing = ElementwiseKernel(
    'T s, S len, S stride',
    'S out',
    'S in0 = s, in1 = len;'
    'out += stride * (in0 - _floor_divide(in0, in1) * in1)',
    'cupy_prepare_array_indexing')
cdef tuple _prepare_multiple_array_indexing(
        _ndarray_base a, Py_ssize_t start, list slices
):
    # slices consist of ndarray
    # Collapse multiple (broadcastable) array indices starting at ``start``
    # into one flat int64 index array over the spanned axes.
    # Returns (reduced_idx, start, stop).
    cdef list indices = [], shapes = []  # int ndarrays
    cdef Py_ssize_t i, stop, stride
    cdef _ndarray_base reduced_idx, s
    for s in slices:
        if s.dtype.kind == 'b':
            # Expand a boolean mask into one integer index array per
            # masked axis (argwhere rows, transposed).  Synchronizes.
            s = _ndarray_argwhere(s).T
            indices.extend(s)
            shapes.append(s.shape[1:])
        else:
            indices.append(s)
            shapes.append(s.shape)
    stop = start + len(indices)
    # br = _manipulation.broadcast(*indices)
    # indices = list(br.values)
    reduced_idx = core.ndarray(
        internal._broadcast_shapes(shapes), dtype=numpy.int64)
    reduced_idx.fill(0)
    # Accumulate per-axis contributions right-to-left (row-major order).
    stride = 1
    i = stop
    for s in reversed(indices):
        i -= 1
        a_shape_i = a._shape[i]
        # wrap all out-of-bound indices
        if a_shape_i != 0:
            _prepare_array_indexing(s, a_shape_i, stride, reduced_idx)
        stride *= a_shape_i
    return reduced_idx, start, stop
cdef _ndarray_base _getitem_multiple(
        _ndarray_base a, Py_ssize_t start, list slices):
    # Advanced indexing with several array indices: collapse them into one
    # flat index array, then gather over the spanned axes with _take.
    flat_idx, first, last = _prepare_multiple_array_indexing(a, start, slices)
    return _take(a, flat_idx, first, last)
cdef _ndarray_base _add_reduceat(
        _ndarray_base array, indices, axis, dtype, out):
    # Implements add.reduceat via a cumulative sum: the segment sum
    # [indices[k], indices[k+1]) is acc[indices[k+1]] - acc[indices[k]].
    # Per NumPy semantics, a non-increasing pair yields array[indices[k]].
    from cupy._sorting import search
    axis = internal._normalize_axis_index(axis, array.ndim)
    # Append the axis length so the last segment runs to the end.
    indices = cupy.append(indices, array.shape[axis])
    shape = [1 if i == axis else dim for i, dim in enumerate(array.shape)]
    acc = array.cumsum(axis, dtype)
    # Prepend a zero row so acc is an exclusive prefix sum along axis.
    acc = cupy.append(cupy.zeros(shape, acc.dtype), acc, axis)
    # mask marks degenerate segments (indices[k] >= indices[k+1]).
    mask = indices[:-1] >= indices[1:]
    mask = mask.reshape(-1, *([1] * (array.ndim - axis - 1)))
    return search._where_ufunc(
        mask,
        array.take(indices[:-1], axis),
        acc.take(indices[1:], axis) - acc.take(indices[:-1], axis),
        out
    )
from cupy._core._carray cimport shape_t
from cupy._core.core cimport _ndarray_base
cpdef compute_type_to_str(compute_type)
cpdef get_compute_type(dtype)
cpdef _ndarray_base dot(_ndarray_base a, _ndarray_base b, _ndarray_base out=*)
cpdef _ndarray_base tensordot_core(
_ndarray_base a, _ndarray_base b, _ndarray_base out, Py_ssize_t n,
Py_ssize_t m, Py_ssize_t k, const shape_t& ret_shape)
cpdef _ndarray_base matmul(
_ndarray_base a, _ndarray_base b, _ndarray_base out=*)
# Compute-type constants used by get_compute_type/set_compute_type to
# control precision/accuracy trade-offs in matmul/dot routines.
cpdef enum:
    COMPUTE_TYPE_TBD = 0
    COMPUTE_TYPE_DEFAULT = 1   # default
    COMPUTE_TYPE_PEDANTIC = 2  # disable algorithmic optimizations
    COMPUTE_TYPE_FP16 = 3      # allow converting inputs to FP16
    COMPUTE_TYPE_FP32 = 4      # allow converting inputs to FP32
    COMPUTE_TYPE_FP64 = 5      # allow converting inputs to FP64
    COMPUTE_TYPE_BF16 = 6      # allow converting inputs to BF16
    COMPUTE_TYPE_TF32 = 7      # allow converting inputs to TF32
import math
import os
import warnings
import cython
import numpy
import cupy
from cupy._core._kernel import ElementwiseKernel
from cupy._core._reduction import ReductionKernel
from cupy._core._ufuncs import elementwise_copy
import cupy._core.core as core
from libc.stdint cimport intptr_t
from cupy._core cimport _accelerator
from cupy._core._carray cimport shape_t
from cupy._core._dtype cimport to_cuda_dtype
from cupy._core._scalar cimport get_typename
from cupy._core.core cimport _internal_ascontiguousarray
from cupy._core.core cimport _ndarray_init
from cupy._core.core cimport ascontiguousarray
from cupy._core.core cimport _ndarray_base
from cupy._core cimport _memory_range
from cupy._core cimport _routines_manipulation as _manipulation
from cupy._core cimport _routines_math as _math
from cupy.cuda cimport device
from cupy_backends.cuda.api cimport runtime
from cupy_backends.cuda.libs cimport cublas
# Minimal declarations of the CUDA complex scalar types used for GEMM
# alpha/beta coefficients; field layout mirrors the C header.
cdef extern from '../../cupy_backends/cupy_complex.h':
    ctypedef struct cuComplex 'cuComplex':
        float x, y
    ctypedef struct cuDoubleComplex 'cuDoubleComplex':
        double x, y
# Cached CUDA runtime version; -1 means "not yet queried via
# runtime.runtimeGetVersion()".
cdef int _cuda_runtime_version = -1
# Per dtype-class compute type, indexed by to_compute_type_index():
# 0 = float16, 1 = float32/complex64, 2 = float64/complex128.
cdef list compute_types = [COMPUTE_TYPE_TBD,  # float16
                           COMPUTE_TYPE_TBD,  # float32
                           COMPUTE_TYPE_TBD]  # float64
# Human-readable names for the COMPUTE_TYPE_* enum values.
cdef dict compute_type_str = {
    0: 'COMPUTE_TYPE_TBD',
    1: 'COMPUTE_TYPE_DEFAULT',
    2: 'COMPUTE_TYPE_PEDANTIC',
    3: 'COMPUTE_TYPE_FP16',
    4: 'COMPUTE_TYPE_FP32',
    5: 'COMPUTE_TYPE_FP64',
    6: 'COMPUTE_TYPE_BF16',
    7: 'COMPUTE_TYPE_TF32',
}
cpdef int to_compute_type_index(dtype) except -1:
    """Map ``dtype`` to its slot in ``compute_types``.

    Returns 0 for float16, 1 for float32/complex64, and 2 for
    float64/complex128; raises ``TypeError`` for anything else.
    """
    cdef str kind_char = numpy.dtype(dtype).char
    if kind_char == 'e':
        return 0
    if kind_char in 'fF':
        return 1
    if kind_char in 'dD':
        return 2
    raise TypeError('dtype is not supported: {}'.format(dtype))
cpdef set_compute_type(dtype, compute_type):
    """Set the compute type used for the dtype class of ``dtype``.

    BF16/TF32 require compute capability 8.0 or higher; on older GPUs a
    warning is emitted and the default compute type is stored instead.
    """
    global compute_types
    always_allowed = (COMPUTE_TYPE_TBD, COMPUTE_TYPE_DEFAULT,
                      COMPUTE_TYPE_PEDANTIC, COMPUTE_TYPE_FP16,
                      COMPUTE_TYPE_FP32, COMPUTE_TYPE_FP64)
    ampere_or_later = (COMPUTE_TYPE_BF16, COMPUTE_TYPE_TF32)
    slot = to_compute_type_index(dtype)
    if compute_type in always_allowed:
        compute_types[slot] = compute_type
    elif compute_type in ampere_or_later:
        if int(device.get_compute_capability()) >= 80:
            compute_types[slot] = compute_type
        else:
            warnings.warn('COMPUTE_TYPE_BF16 and COMPUTE_TYPE_TF32 are only '
                          'available on GPUs with compute capability 8.0 or '
                          'higher. COMPUTE_TYPE_DEFAULT will be used instead.')
            compute_types[slot] = COMPUTE_TYPE_DEFAULT
    else:
        raise ValueError('Unknown compute type: {}'.format(compute_type))
cpdef compute_type_to_str(compute_type):
    """Return the symbolic name for ``compute_type``.

    Unknown values are returned unchanged.
    """
    return compute_type_str.get(compute_type, compute_type)
def _tensordot_core_int_kernel_impl(config, dtype, code, name):
    """Compile and return an integer-GEMM kernel specialized for ``dtype``.

    ``code`` (a templated ``__global__`` wrapper calling
    ``_tensordot_core_int_kernel_impl``) is appended after the blocked
    GEMM device function below; each ``(key, value)`` in ``config``
    becomes a ``#define`` line prepended to the source.
    """
    # This code is based in the GEMM implementation from MAGMA
    # (http://icl.cs.utk.edu/magma/)
    code = '''
#define fetch(arr, col, m, n, bound) arr[min(n*col + m, bound)]
template<typename T>
__device__ void _tensordot_core_int_kernel_impl(
        int M, int N, int K,
        const T* A,
        const T* B,
        T * C)
{
    int idx = threadIdx.x;
    int idy = threadIdx.y;
    int idt = DIM_X * idy + idx;
    int idxA = idt % DIM_XA;
    int idyA = idt / DIM_XA;
    int idxB = idt % DIM_XB;
    int idyB = idt / DIM_XB;
    int blx = blockIdx.x;
    int bly = blockIdx.y;
    __shared__ T sA[BLK_K][BLK_M + 1];
    __shared__ T sB[BLK_N][BLK_K + 1];
    // registers for the innermost loop
    T rC[THR_N][THR_M];
    T rA[THR_M];
    T rB[THR_N];
    T ra[BLK_K / DIM_YA][BLK_M / DIM_XA];
    T rb[BLK_N / DIM_YB][BLK_K / DIM_XB];
    const T* offs_dA = A + blx * BLK_M + idyA * M + idxA;
    int boundA = (M * (K - 1) + M) - (blx * BLK_M + idyA * M + idxA) - 1;
    const T* offs_dB = B + bly * BLK_N * K + idyB * K + idxB;
    int boundB = (K * (N - 1) + K) - (bly * BLK_N * K + idyB * K + idxB) - 1;
    int m, n, k, kk;
    #pragma unroll
    for (n = 0; n < THR_N; n++) {
        #pragma unroll
        for (m = 0 ; m < THR_M; m++) {
            rC[n][m] = 0;
        }
    }
    // blockwise transpose to transpose load
    #pragma unroll
    for (n = 0; n < BLK_K; n += DIM_YA) {
        #pragma unroll
        for (m = 0; m < BLK_M; m += DIM_XA) {
            sA[n + idyA][m + idxA] = fetch(offs_dA, M, m, n, boundA);
        }
    }
    // blockwise transpose to transpose load
    #pragma unroll
    for (n = 0; n < BLK_N; n += DIM_YB) {
        #pragma unroll
        for (m = 0; m < BLK_K; m += DIM_XB) {
            sB[n + idyB][m + idxB] = fetch(offs_dB, K, m, n, boundB);
        }
    }
    __syncthreads();
    for (kk = 0; kk < K - BLK_K; kk += BLK_K)
    {
        offs_dA += BLK_K * M;
        boundA -= BLK_K * M;
        offs_dB += BLK_K;
        boundB -= BLK_K;
        #pragma unroll
        for (n = 0; n < BLK_K / DIM_YA; n++) {
            #pragma unroll
            for (m = 0; m < BLK_M / DIM_XA; m++) {
                ra[n][m] = fetch(offs_dA, M, m * DIM_XA, n * DIM_YA, boundA);
            }
        }
        #pragma unroll
        for (n = 0; n < BLK_N / DIM_YB; n++) {
            #pragma unroll
            for (m = 0; m < BLK_K / DIM_XB; m++) {
                rb[n][m] = fetch(offs_dB, K, m * DIM_XB, n * DIM_YB, boundB);
            }
        }
        // multiply
        #pragma unroll
        for (k = 0; k < BLK_K; k++)
        {
            #pragma unroll
            for (m = 0; m < THR_M; m++) {
                rA[m] = sA[k][m * DIM_X + idx];
            }
            #pragma unroll
            for (n = 0; n < THR_N; n++) {
                rB[n] = sB[n * DIM_Y + idy][k];
            }
            // HIP is strange...
            #ifdef __HIP_DEVICE_COMPILE__
            __syncthreads();
            #endif
            #pragma unroll
            for (n = 0; n < THR_N; n++) {
                #pragma unroll
                for (m = 0; m < THR_M; m++) {
                    rC[n][m] += rA[m] * rB[n];
                }
            }
        }
        __syncthreads();
        // store A regs->smem
        #pragma unroll
        for (n = 0; n < BLK_K / DIM_YA; n++)
        {
            #pragma unroll
            for (m = 0; m < BLK_M / DIM_XA; m++)
            {
                sA[n * DIM_YA + idyA][m * DIM_XA + idxA] = ra[n][m];
            }
        }
        #pragma unroll
        for (n = 0; n < BLK_N / DIM_YB; n++)
        {
            #pragma unroll
            for (m = 0; m < BLK_K / DIM_XB; m++)
            {
                sB[n * DIM_YB + idyB][m * DIM_XB + idxB] = rb[n][m];
            }
        }
        __syncthreads();
    }
    // Multiply last full (BLK_K) or partial block of columns of A and
    // rows of B.
    // It's okay that m,n exceed matrix bounds as all work is in registers
    // or shared memory, and out-of-bounds rC[n][m] will not be saved later.
    kk = K - kk;
    #pragma unroll
    for (k = 0; k < kk; k++)
    {
        #pragma unroll
        for (m = 0; m < THR_M; m++) {
            rA[m] = sA[k][m * DIM_X + idx];
        }
        #pragma unroll
        for (n = 0; n < THR_N; n++) {
            rB[n] = sB[n * DIM_Y + idy][k];
        }
        // HIP is strange...
        #ifdef __HIP_DEVICE_COMPILE__
        __syncthreads();
        #endif
        #pragma unroll
        for (n = 0; n < THR_N; n++) {
            #pragma unroll
            for (m = 0; m < THR_M; m++) {
                rC[n][m] += rA[m] * rB[n];
            }
        }
    }
    #pragma unroll
    for (n = 0; n < THR_N; n++) {
        int coord_dCn = bly * BLK_N + n * DIM_Y + idy;
        #pragma unroll
        for (m = 0; m < THR_M; m++) {
            int coord_dCm = blx * BLK_M + m * DIM_X + idx;
            if (coord_dCm < M && coord_dCn < N) {
                C[coord_dCn * M + coord_dCm] = rC[n][m];
            }
        }
    }
}
''' + code
    # Turn the tuning parameters into #define lines at the top of the
    # source so the template constants are fixed at compile time.
    for k, v in config:
        code = '#define ' + k + ' ' + str(v) + '\n' + code
    # Instantiate the template for every integer-like dtype so any of
    # them can be fetched from the compiled module.
    name_expressions = [f'{name}<bool>',
                        f'{name}<signed char>',
                        f'{name}<unsigned char>',
                        f'{name}<short>',
                        f'{name}<unsigned short>',
                        f'{name}<int>',
                        f'{name}<unsigned int>',
                        f'{name}<long>',
                        f'{name}<unsigned long>',
                        f'{name}<long long>',
                        f'{name}<unsigned long long>']
    mod = cupy.RawModule(code=code, options=('--std=c++11',),
                         name_expressions=name_expressions)
    ker = mod.get_function(name + '<' + get_typename(dtype) + '>')
    return ker
@cupy._util.memoize(for_each_device=True)
def _tensordot_core_int_kernel(config, dtype):
    """Return the single (non-batched) integer GEMM kernel for ``dtype``."""
    code = '''
template<typename T>
__global__ void _tensordot_core_int_kernel(
        int M, int N, int K,
        const T* A,
        const T* B,
        T * C)
{
    _tensordot_core_int_kernel_impl(M, N, K, A, B, C);
}
'''
    name = '_tensordot_core_int_kernel'
    return _tensordot_core_int_kernel_impl(config, dtype, code, name)
@cupy._util.memoize(for_each_device=True)
def _tensordot_core_int_batched_kernel(config, dtype):
    """Return the pointer-array batched integer GEMM kernel for ``dtype``.

    One GEMM per ``blockIdx.z``; operand addresses come from arrays of
    per-matrix device pointers (see ``_mat_ptrs``).
    """
    code = '''
template<typename T>
__global__ void _tensordot_core_int_batched_kernel(
        int M, int N, int K,
        const T* A[], const T* B[],
        T* C[])
{
    int batchid = blockIdx.z;
    _tensordot_core_int_kernel_impl(
        M, N, K, A[batchid], B[batchid], C[batchid]
    );
}
'''
    name = '_tensordot_core_int_batched_kernel'
    return _tensordot_core_int_kernel_impl(config, dtype, code, name)
@cupy._util.memoize(for_each_device=True)
def _tensordot_core_int_strided_batched_kernel(config, dtype):
    """Return the strided batched integer GEMM kernel for ``dtype``.

    One GEMM per ``blockIdx.z``; operand addresses are computed from a
    base pointer plus ``batchid * stride`` (no pointer arrays needed).
    """
    code = '''
template<typename T>
__global__ void _tensordot_core_int_strided_batched_kernel(
        int M, int N, int K,
        const T* A, long long strideA,
        const T* B, long long strideB,
        T * C, long long strideC)
{
    int batchid = blockIdx.z;
    _tensordot_core_int_kernel_impl(
        M, N, K,
        &A[batchid * strideA],
        &B[batchid * strideB],
        &C[batchid * strideC]
    );
}
'''
    name = '_tensordot_core_int_strided_batched_kernel'
    return _tensordot_core_int_kernel_impl(config, dtype, code, name)
cdef tuple _integral_tensordot_core_config():
    """Return ``(config, dim_x, dim_y, blk_m, blk_n)`` for the integer
    GEMM kernels: the #define key/value pairs plus the launch geometry
    the callers need.
    """
    # TODO(leofang): autotune the tuning parameters here? See the discussion
    # in this thread: https://groups.google.com/a/icl.utk.edu/g/magma-user/c/igc66uduTfI # NOQA
    dim_x, dim_y = 16, 16
    blk_m, blk_n, blk_k = 128, 128, 2
    dim_xa, dim_ya = 128, 2
    dim_xb, dim_yb = 2, 128
    config = (
        ('DIM_X', dim_x), ('DIM_Y', dim_y),
        ('BLK_M', blk_m), ('BLK_N', blk_n), ('BLK_K', blk_k),
        ('DIM_XA', dim_xa), ('DIM_YA', dim_ya),
        ('DIM_XB', dim_xb), ('DIM_YB', dim_yb),
        ('THR_M', blk_m // dim_x), ('THR_N', blk_n // dim_y),
    )
    return config, dim_x, dim_y, blk_m, blk_n
cdef _ndarray_base _integral_tensordot_core(
        _ndarray_base a, _ndarray_base b, _ndarray_base out, Py_ssize_t m,
        Py_ssize_t n, Py_ssize_t k, str dtype, const shape_t& ret_shape):
    # Single GEMM for dtypes the cuBLAS path does not handle (dispatch in
    # tensordot_core routes non-'efdFD' dtypes here).  Writes into `out`
    # and returns it.
    config, dim_x, dim_y, blk_m, blk_n = _integral_tensordot_core_config()
    kern = _tensordot_core_int_kernel(config, dtype)
    args = (m, n, k, a, b, out)
    # One (blk_m x blk_n) output tile per block.
    grid = (int(math.ceil(m / blk_m)), int(math.ceil(n / blk_n)), 1)
    block = (dim_x, dim_y, 1)
    kern(grid, block, args=args)
    return out
cdef _ndarray_base _integral_tensordot_core_batched(
        _ndarray_base a, _ndarray_base b, _ndarray_base out, Py_ssize_t m,
        Py_ssize_t n, Py_ssize_t k, str dtype, Py_ssize_t batch_count):
    # Pointer-array batched GEMM for non-cuBLAS dtypes; used when
    # operands are broadcast so matrices are not evenly strided.
    config, dim_x, dim_y, blk_m, blk_n = _integral_tensordot_core_config()
    kern = _tensordot_core_int_batched_kernel(config, dtype)
    block = (dim_x, dim_y, 1)
    # Device arrays of per-matrix pointers, one entry per batch element.
    matPtrA = _mat_ptrs(a)
    matPtrB = _mat_ptrs(b)
    matPtrOut = _mat_ptrs(out)
    # Batches are mapped to grid.z, so launch in chunks of 65000
    # (NOTE(review): presumably to stay under the CUDA grid.z limit of
    # 65535 -- confirm).
    max_batch_count = 65000
    for i in range(0, batch_count, max_batch_count):
        ibatch = min(max_batch_count, batch_count - i)
        args = (
            m, n, k, matPtrA[i:i + ibatch], matPtrB[i:i + ibatch],
            matPtrOut[i:i + ibatch])
        grid = (int(math.ceil(m / blk_m)), int(math.ceil(n / blk_n)), ibatch)
        kern(grid, block, args=args)
    return out
cdef _ndarray_base _integral_tensordot_core_strided_batched(
        _ndarray_base a, _ndarray_base b, _ndarray_base out, Py_ssize_t m,
        Py_ssize_t n, Py_ssize_t k, str dtype, Py_ssize_t batch_count):
    # Strided batched GEMM for non-cuBLAS dtypes; usable when operands
    # are contiguous batches (no broadcasting), so each matrix sits a
    # fixed element stride after the previous one.
    config, dim_x, dim_y, blk_m, blk_n = _integral_tensordot_core_config()
    kern = _tensordot_core_int_strided_batched_kernel(config, dtype)
    block = (dim_x, dim_y, 1)
    # Collapse leading batch axes into one.
    a = a.reshape((-1,) + a.shape[-2:])
    b = b.reshape((-1,) + b.shape[-2:])
    out = out.reshape((-1,) + out.shape[-2:])
    strideA = _get_stride_for_strided_batched_gemm(a)
    strideB = _get_stride_for_strided_batched_gemm(b)
    strideOut = _get_stride_for_strided_batched_gemm(out)
    # Batches are mapped to grid.z, so launch in chunks of 65000
    # (NOTE(review): presumably to stay under the CUDA grid.z limit of
    # 65535 -- confirm).
    max_batch_count = 65000
    for i in range(0, batch_count, max_batch_count):
        ibatch = min(max_batch_count, batch_count - i)
        args = (
            m, n, k, a[i:i + ibatch], strideA, b[i:i + ibatch], strideB,
            out[i:i + ibatch], strideOut)
        grid = (int(math.ceil(m / blk_m)), int(math.ceil(n / blk_n)), ibatch)
        kern(grid, block, args=args)
    return out
# Elementwise product reduced by summation; fallback for the scalar
# (m == 1 and n == 1) output case of tensordot_core when no CUB/cuTENSOR
# accelerator is enabled.
cdef _tensordot_core_mul_sum = ReductionKernel(
    'S x, T y', 'U out',
    'static_cast<U>(x) * static_cast<U>(y)',
    'a + b', 'out = a', '0', '_tensordot_core_mul_sum')
cpdef get_compute_type(dtype):
    """Return the compute type for the dtype class of ``dtype``.

    Resolved lazily: while the slot is still TBD the default is chosen
    (honoring the ``CUPY_TF32`` environment variable for
    float32/complex64) and cached via ``set_compute_type``.
    """
    global compute_types
    cdef int index = to_compute_type_index(dtype)
    if compute_types[index] == COMPUTE_TYPE_TBD:
        compute_type = COMPUTE_TYPE_DEFAULT
        dtype_char = numpy.dtype(dtype).char
        if dtype_char in 'fF' and int(os.getenv('CUPY_TF32', '0')) > 0:
            compute_type = COMPUTE_TYPE_TF32
        set_compute_type(dtype, compute_type)
    return compute_types[index]
@cython.profile(False)
cpdef inline tuple _mat_to_cublas_contiguous(
        _ndarray_base a, Py_ssize_t trans):
    """Return ``(a, trans, ld)`` usable by cuBLAS (F-order) GEMM.

    An F-contiguous matrix is used as-is; otherwise ``a`` is made
    C-contiguous (copying if needed) and the transpose flag is flipped.
    """
    assert a.ndim == 2
    if a._f_contiguous:
        # builtin max function is not used for Cython 0.23
        lda = a._strides[1] // a.itemsize
        if lda < a._shape[0]:
            lda = a._shape[0]
        return a, trans, lda
    if not a._c_contiguous:
        a = a.copy()
    return a, 1 - trans, a._strides[0] // a.itemsize
cpdef _ndarray_base dot(
        _ndarray_base a, _ndarray_base b, _ndarray_base out=None):
    """Dot product of two arrays (implementation behind ``cupy.dot``).

    1-D operands are promoted to matrices, the contraction axes are
    rolled to the front so ``tensordot_core`` sees ``a`` as (k, n) and
    ``b`` as (k, m), and unit axes added for vectors are excluded from
    ``ret_shape``.
    """
    cdef Py_ssize_t a_ndim, b_ndim, a_axis, b_axis, n, m, k
    cdef bint input_a_is_vec, input_b_is_vec
    cdef shape_t ret_shape, shape
    a_ndim = a._shape.size()
    b_ndim = b._shape.size()
    if out is not None:
        if numpy.result_type(a.dtype, b.dtype) != out.dtype:
            raise ValueError('Not supported dtype combination.')
        if not out._c_contiguous:
            raise ValueError('Output array must be C-contiguous')
    # Scalar operands degenerate to elementwise multiplication.
    if a_ndim == 0 or b_ndim == 0:
        return _math._multiply(a, b, out=out)
    input_a_is_vec = a_ndim == 1
    input_b_is_vec = b_ndim == 1
    # Promote vectors: a -> (1, len), b -> (len, 1).
    if input_a_is_vec:
        shape.clear()
        shape.push_back(1)
        shape.push_back(a.size)
        a = _manipulation._reshape(a, shape)
        a_ndim = 2
    if input_b_is_vec:
        shape.clear()
        shape.push_back(b.size)
        shape.push_back(1)
        b = _manipulation._reshape(b, shape)
        b_ndim = 2
    # Contract a's last axis with b's second-to-last axis.
    a_axis = a_ndim - 1
    b_axis = b_ndim - 2
    if a._shape[a_axis] != b._shape[b_axis]:
        raise ValueError('Axis dimension mismatch')
    # Move the contraction axes to the front for tensordot_core.
    if a_axis:
        a = _manipulation.rollaxis(a, a_axis, 0)
    if b_axis:
        b = _manipulation.rollaxis(b, b_axis, 0)
    k = a._shape[0]
    if k != 0:
        m = b.size // k
        n = a.size // k
    else:
        # When k==0, the function must return a matrix filled with zero
        # like NumPy.
        m = 0
        n = 0
    # Result shape: remaining axes of a, then remaining axes of b;
    # axes that came from vector promotion are dropped.
    if not input_a_is_vec:
        ret_shape.insert(ret_shape.end(), a._shape.begin() + 1, a._shape.end())
    if not input_b_is_vec:
        ret_shape.insert(ret_shape.end(), b._shape.begin() + 1, b._shape.end())
    if out is not None:
        # TODO(kataoka): Make the condition strict
        if k != 0 and out.size != n * m:
            raise ValueError('Output array has an invalid size')
    return tensordot_core(a, b, out, n, m, k, ret_shape)
cpdef _ndarray_base tensordot_core(
        _ndarray_base a, _ndarray_base b, _ndarray_base out, Py_ssize_t n,
        Py_ssize_t m, Py_ssize_t k, const shape_t& ret_shape):
    """Core GEMM dispatcher: computes the (n, m) product of ``a`` (k, n)
    and ``b`` (k, m), reshaped to ``ret_shape``.

    Paths, in order: empty operands -> zero-filled result; scalar
    output -> reduction kernel; non-float dtypes -> custom integer GEMM;
    CUDA >= 11 -> cublasGemmEx (tensordot_core_v11); otherwise legacy
    per-dtype cuBLAS calls.  ``out``, if given, must be C-contiguous with
    the right shape; a temporary is used when it may alias an operand.
    """
    # out, if specified, must be C-contiguous and have correct shape.
    cdef shape_t shape
    cdef Py_ssize_t transa, transb, lda, ldb
    cdef intptr_t handle
    cdef _ndarray_base copy_to_out = None
    cdef str dtype = a.dtype.char
    cdef int compute_capability = int(device.get_compute_capability())
    if dtype != b.dtype.char:
        dtype = numpy.promote_types(dtype, b.dtype).char
    # Empty contraction: result is all zeros (NumPy semantics).
    if not a.size or not b.size:
        if out is None:
            out = _ndarray_init(cupy.ndarray, ret_shape, dtype, None)
        out.fill(0)
        return out
    if out is not None:
        assert out.flags.c_contiguous and out.dtype == dtype
    cdef int ace
    # Scalar output: a plain multiply-sum reduction is enough.
    if m == 1 and n == 1:
        if out is None:
            out = _ndarray_init(cupy.ndarray, ret_shape, dtype, None)
        c = _manipulation._reshape(out, ())
        for ace in _accelerator._routine_accelerators:
            # fast path using CUB or cuTENSOR
            if ace in (_accelerator.ACCELERATOR_CUB,
                       _accelerator.ACCELERATOR_CUTENSOR):
                (a.ravel() * b.ravel()).sum(out=c)
                break
        else:
            _tensordot_core_mul_sum(a.ravel(), b.ravel(), out=c)
        return out
    a = a.astype(dtype, order='K', casting=None, subok=None, copy=False)
    b = b.astype(dtype, order='K', casting=None, subok=None, copy=False)
    # It copies the operands if needed
    if a._shape.size() != 2 or a._shape[0] != k or a._shape[1] != n:
        shape.clear()
        shape.push_back(k)
        shape.push_back(n)
        a = _manipulation._reshape(a, shape)
    if b._shape.size() != 2 or b._shape[0] != k or b._shape[1] != m:
        shape.clear()
        shape.push_back(k)
        shape.push_back(m)
        b = _manipulation._reshape(b, shape)
    # Be careful that cuBLAS uses the FORTRAN-order matrix representation.
    # Matrix-Matrix product A^T * B
    # c is C-contiguous while cuBLAS assumes F-contiguous inputs, so we
    # compute C^T = B^T * A here.
    a, transa, lda = _mat_to_cublas_contiguous(a, 0)
    b, transb, ldb = _mat_to_cublas_contiguous(b, 1)
    if out is None:
        out = c = _ndarray_init(cupy.ndarray, ret_shape, dtype, None)
    elif (
        _memory_range.may_share_bounds(out, a)
        or _memory_range.may_share_bounds(out, b)
    ):
        # GEMM must not write into memory it is reading from; compute
        # into a temporary and copy back at the end.
        copy_to_out = c = _ndarray_init(cupy.ndarray, ret_shape, dtype, None)
    else:
        c = out
    # Give c a 2-D (n, m) view for the GEMM call.
    if c._shape.size() != 2 or c._shape[0] != n or c._shape[1] != m:
        c = c.view()
        c.shape = (n, m)
    # Non-float dtypes (integer/bool): use the custom kernel; transposes
    # must be materialized because the kernel takes no trans flags.
    if dtype not in 'efdFD':
        if transa:
            a = a.T
            a = _internal_ascontiguousarray(a)
        if transb:
            b = _internal_ascontiguousarray(b)
        _integral_tensordot_core(b, a, c, m, n, k, dtype, ret_shape)
        if copy_to_out is not None:
            elementwise_copy(copy_to_out, out)
        return out
    global _cuda_runtime_version
    if _cuda_runtime_version < 0:
        _cuda_runtime_version = runtime.runtimeGetVersion()
    # Preferred path on CUDA 11+: cublasGemmEx with compute-type control.
    if (
        not runtime._is_hip_environment and
        _cuda_runtime_version >= 11000 and
        compute_capability >= 50
    ):
        tensordot_core_v11(transb, transa, m, n, k, b, ldb, a, lda, c, m)
        if copy_to_out is not None:
            elementwise_copy(copy_to_out, out)
        return out
    # Legacy per-dtype cuBLAS path.
    handle = device.get_cublas_handle()
    if dtype == 'e':
        coef_dtype = 'f'
    else:
        coef_dtype = dtype
    one = numpy.array(1.0, dtype=coef_dtype)
    zero = numpy.array(0.0, dtype=coef_dtype)
    if runtime._is_hip_environment and dtype == 'e':
        # On HIP, SgemmEx does not work for half precision
        dtype = 'f'
        a = a.astype(dtype, order='K', casting=None, subok=None, copy=True)
        b = b.astype(dtype, order='K', casting=None, subok=None, copy=True)
        c = _ndarray_init(cupy.ndarray, ret_shape, dtype, None)
        copy_to_out = c
        warnings.warn('On ROCm/HIP, there is no specialized API to handle '
                      'half precision floating numbers, so the computation '
                      'will be done by casting to single precision')
    if dtype == 'e':
        use_tensor_core = (not runtime._is_hip_environment and
                           _cuda_runtime_version >= 9000 and
                           compute_capability >= 70)
        if use_tensor_core:
            cublas.setMathMode(handle, cublas.CUBLAS_TENSOR_OP_MATH)
            cublas.gemmEx(
                handle, <int>transb, <int> transa, <int>m, <int>n, <int>k,
                one.ctypes.data, b.data.ptr, runtime.CUDA_R_16F, <int>ldb,
                a.data.ptr, runtime.CUDA_R_16F, <int>lda, zero.ctypes.data,
                c.data.ptr, runtime.CUDA_R_16F, <int>m, runtime.CUDA_R_32F,
                cublas.CUBLAS_GEMM_DEFAULT_TENSOR_OP)
            cublas.setMathMode(handle, cublas.CUBLAS_DEFAULT_MATH)
        else:
            cublas.sgemmEx(
                handle, <int>transb, <int> transa, <int>m, <int>n, <int>k,
                one.ctypes.data, b.data.ptr, runtime.CUDA_R_16F, <int>ldb,
                a.data.ptr, runtime.CUDA_R_16F, <int>lda, zero.ctypes.data,
                c.data.ptr, runtime.CUDA_R_16F, <int>m)
    elif dtype == 'f':
        cublas.sgemmEx(
            handle, <int>transb, <int> transa, <int>m, <int>n, <int>k,
            one.ctypes.data, b.data.ptr, runtime.CUDA_R_32F, <int>ldb,
            a.data.ptr, runtime.CUDA_R_32F, <int>lda, zero.ctypes.data,
            c.data.ptr, runtime.CUDA_R_32F, <int>m)
    elif dtype == 'd':
        cublas.dgemm(
            handle, <int>transb, <int>transa, <int>m, <int>n, <int>k,
            one.ctypes.data, b.data.ptr, <int>ldb, a.data.ptr, <int>lda,
            zero.ctypes.data, c.data.ptr, <int>m)
    elif dtype == 'F':
        cublas.cgemm(
            handle, <int>transb, <int>transa, <int>m, <int>n, <int>k,
            one.ctypes.data, b.data.ptr, <int>ldb, a.data.ptr, <int>lda,
            zero.ctypes.data, c.data.ptr, <int>m)
    elif dtype == 'D':
        cublas.zgemm(
            handle, <int>transb, <int>transa, <int>m, <int>n, <int>k,
            one.ctypes.data, b.data.ptr, <int>ldb, a.data.ptr, <int>lda,
            zero.ctypes.data, c.data.ptr, <int>m)
    else:
        raise ValueError('Invalid dtype: %s' % str(dtype))
    if copy_to_out is not None:
        elementwise_copy(copy_to_out, out)
    return out
cpdef _ndarray_base tensordot_core_v11(
        Py_ssize_t transa, Py_ssize_t transb, Py_ssize_t m, Py_ssize_t n,
        Py_ssize_t k, _ndarray_base a, Py_ssize_t lda, _ndarray_base b,
        Py_ssize_t ldb, _ndarray_base c, Py_ssize_t ldc):
    """GEMM via ``cublasGemmEx`` (CUDA 11+), honoring the configured
    compute type for ``c.dtype``.

    Writes the product into ``c`` in place.  NOTE(review): despite the
    declared ``_ndarray_base`` return type, no value is returned (the
    function falls off the end, i.e. returns None).
    """
    cdef float one_f, zero_f
    cdef double one_d, zero_d
    cdef cuComplex one_F, zero_F
    cdef cuDoubleComplex one_D, zero_D
    cdef size_t one_ptr, zero_ptr
    cdef int compute_capability = int(device.get_compute_capability())
    cdef int compute_type = get_compute_type(c.dtype)
    cdef int cublas_compute_type = -1
    # Choose the cuBLAS compute type from the dtype class and the
    # user-configured compute type.
    if c.dtype.char in 'efF':
        if compute_type == COMPUTE_TYPE_PEDANTIC:
            cublas_compute_type = cublas.CUBLAS_COMPUTE_32F_PEDANTIC
        elif compute_type == COMPUTE_TYPE_TF32 and c.dtype.char in 'fF':
            cublas_compute_type = cublas.CUBLAS_COMPUTE_32F_FAST_TF32
        else:
            cublas_compute_type = cublas.CUBLAS_COMPUTE_32F
    elif c.dtype.char in 'dD':
        if compute_type == COMPUTE_TYPE_PEDANTIC:
            cublas_compute_type = cublas.CUBLAS_COMPUTE_64F_PEDANTIC
        else:
            cublas_compute_type = cublas.CUBLAS_COMPUTE_64F
    else:
        raise ValueError('Invalid dtype: {}'.format(c.dtype))
    cdef int algo = cublas.CUBLAS_GEMM_DEFAULT
    if ((compute_capability >= 80) or
            (compute_capability >= 70 and c.dtype == 'e')):
        algo = cublas.CUBLAS_GEMM_DEFAULT_TENSOR_OP
    # alpha/beta must match the compute type's scalar width, and are
    # passed by host pointer.
    if cublas_compute_type in (cublas.CUBLAS_COMPUTE_32F,
                               cublas.CUBLAS_COMPUTE_32F_PEDANTIC,
                               cublas.CUBLAS_COMPUTE_32F_FAST_TF32):
        if c.dtype.char in 'efd':
            one_f = 1
            zero_f = 0
            one_ptr = <size_t>&one_f
            zero_ptr = <size_t>&zero_f
        else:
            one_F = cuComplex(1, 0)
            zero_F = cuComplex(0, 0)
            one_ptr = <size_t>&one_F
            zero_ptr = <size_t>&zero_F
    elif cublas_compute_type in (cublas.CUBLAS_COMPUTE_64F,
                                 cublas.CUBLAS_COMPUTE_64F_PEDANTIC):
        if c.dtype.char in 'efd':
            one_d = 1
            zero_d = 0
            one_ptr = <size_t>&one_d
            zero_ptr = <size_t>&zero_d
        else:
            one_D = cuDoubleComplex(1, 0)
            zero_D = cuDoubleComplex(0, 0)
            one_ptr = <size_t>&one_D
            zero_ptr = <size_t>&zero_D
    else:
        raise ValueError('Invalid cublas compute type: {}'
                         .format(cublas_compute_type))
    cdef int a_cuda_dtype = to_cuda_dtype(a.dtype, is_half_allowed=True)
    cdef int b_cuda_dtype = to_cuda_dtype(b.dtype, is_half_allowed=True)
    cdef int c_cuda_dtype = to_cuda_dtype(c.dtype, is_half_allowed=True)
    cdef intptr_t handle = device.get_cublas_handle()
    cublas.gemmEx(
        handle, <int>transa, <int>transb, <int>m, <int>n, <int>k, one_ptr,
        a.data.ptr, a_cuda_dtype, <int>lda, b.data.ptr, b_cuda_dtype, <int>ldb,
        zero_ptr, c.data.ptr, c_cuda_dtype, <int>ldc, cublas_compute_type,
        algo)
cdef Py_ssize_t _get_stride_for_strided_batched_gemm(
        _ndarray_base a) except? 0:
    # Element stride between consecutive matrices of a C-contiguous
    # batch: the element count of one trailing (rows, cols) matrix.
    cdef int ndim = a._shape.size()
    assert ndim > 2
    assert a._c_contiguous
    return a._shape[ndim - 2] * a._shape[ndim - 1]
# Computes base + (last-axis index) * stride per output element; used by
# _mat_ptrs to expand a base device pointer into per-matrix pointers.
cdef _mat_ptrs_kernel = ElementwiseKernel(
    'T base, T stride', 'T out',
    'out = base + _ind.get()[_ind.ndim - 1] * stride', 'cupy_mat_ptrs',
    reduce_dims=False)
cpdef _ndarray_base _mat_ptrs(_ndarray_base a):
    """Creates an array of pointers to matrices

    Args:
        a: A batch of matrices on GPU.
            shape: (A, B, C) -> A ptrs to mat of size (B, C)
            shape: (A_1, ..., A_N, B, C) -> A_1*...*A_N ptrs to mat of
                   size (B, C)

    Returns:
        GPU array (dtype uintp) of device pointers, one per matrix.
    """
    cdef int ndim = a._shape.size()
    assert ndim > 2
    cdef _ndarray_base idx
    # Pointers along the first batch axis: data.ptr + j * strides[0].
    idx = _mat_ptrs_kernel(
        a.data.ptr, a._strides[0],
        core.ndarray((a._shape[0],), dtype=numpy.uintp))
    # Fold in each remaining batch axis: every existing pointer fans out
    # into shape[i] pointers offset by strides[i].
    for i in range(1, ndim - 2):
        idx = _mat_ptrs_kernel(
            idx[:, None], a._strides[i],
            core.ndarray((idx.size, a._shape[i]), dtype=numpy.uintp))
        idx = idx.ravel()
    return idx
cpdef _ndarray_base matmul(
        _ndarray_base a, _ndarray_base b, _ndarray_base out=None):
    """Matrix product of two arrays.

    Returns the matrix product of two arrays and is the implementation of
    the `@` operator introduced in Python 3.5 following PEP465.

    The main difference against cupy.dot are the handling of arrays with
    more than 2 dimensions. For more information see :func:`numpy.matmul`.

    Args:
        a (cupy.ndarray): The left argument.
        b (cupy.ndarray): The right argument.
        out (cupy.ndarray): Output array.

    Returns:
        cupy.ndarray: Output array.

    .. seealso:: :func:`numpy.matmul`
    """
    cdef Py_ssize_t i, n, m, ka, kb, a_sh, b_sh, c_sh, ldc
    cdef Py_ssize_t batchCount, a_part_outshape, b_part_outshape
    cdef int orig_a_ndim, orig_b_ndim, a_ndim, b_ndim, ndim
    cdef _ndarray_base ap, bp, cp, c_view
    cdef bint use_broadcast
    orig_a_ndim = a._shape.size()
    orig_b_ndim = b._shape.size()
    if orig_a_ndim == 0 or orig_b_ndim == 0:
        raise ValueError('Scalar operands are not allowed, use \'*\' instead')
    ndim = max(orig_a_ndim, orig_b_ndim)
    # <= 2-D matmul is exactly dot(); add a temporary when `out` cannot
    # receive the result directly.
    if ndim <= 2:
        if out is None:
            return dot(a, b, out)
        ret_dtype = numpy.promote_types(a.dtype, b.dtype)
        if out._c_contiguous and ret_dtype == out.dtype:
            return dot(a, b, out)
        c = _ndarray_init(cupy.ndarray, out._shape, dtype=ret_dtype, obj=None)
        dot(a, b, c)
        elementwise_copy(c, out)
        return out
    orig_a = a
    orig_b = b
    # Promote 1-D operands to matrices; their unit axis is excluded from
    # the output shape below.
    a_part_outshape = b_part_outshape = 0
    if orig_a_ndim == 1:
        a = _manipulation._reshape(a, (1, a.size))
    else:
        a = a.view()
        a_part_outshape = a._shape[orig_a_ndim - 2]
    if orig_b_ndim == 1:
        b = _manipulation._reshape(b, (b.size, 1))
        ldc = 1
    else:
        b = b.view()
        b_part_outshape = ldc = b._shape[orig_b_ndim - 1]
    # expand dims
    a_ndim = a._shape.size()
    b_ndim = b._shape.size()
    if a_ndim < ndim:
        # TODO(niboshi): Confirm update_x_contiguity flags
        a._set_shape_and_strides(
            (1,) * (ndim - a_ndim) + a.shape,
            (0,) * (ndim - a_ndim) + a.strides,
            True, True)
    if b_ndim < ndim:
        # TODO(niboshi): Confirm update_x_contiguity flags
        b._set_shape_and_strides(
            (1,) * (ndim - b_ndim) + b.shape,
            (0,) * (ndim - b_ndim) + b.strides,
            True, True)
    ret_dtype = numpy.promote_types(a.dtype, b.dtype)
    dtype = ret_dtype
    # fp16 is computed in fp32; the result is cast back on copy-out.
    if dtype.char == 'e':
        dtype = numpy.dtype('f')
    a = ascontiguousarray(a, dtype)
    b = ascontiguousarray(b, dtype)
    # broadcast: align batch axes by setting stride 0 on size-1 axes.
    batchCount = 1  # batchCount = numpy.prod(out_shape[:-2])
    out_shape = []
    use_broadcast = False
    for i in range(0, ndim - 2):
        a_sh = a._shape[i]
        b_sh = b._shape[i]
        if a_sh != b_sh and a_sh != 1 and b_sh != 1:
            raise ValueError(
                'operands could not be broadcast together with '
                'remapped shapes')
        if a_sh == 0 or b_sh == 0:
            c_sh = 0
        else:
            c_sh = max(a_sh, b_sh)
        batchCount *= c_sh
        out_shape.append(c_sh)
        if a_sh == 1 and c_sh > 1:
            a._strides[i] = 0
            a._shape[i] = c_sh
            a._c_contiguous = a._f_contiguous = False
            use_broadcast = True
        if b_sh == 1 and c_sh > 1:
            b._strides[i] = 0
            b._shape[i] = c_sh
            b._c_contiguous = b._f_contiguous = False
            use_broadcast = True
    if orig_a_ndim != 1:
        out_shape.append(a_part_outshape)
    if orig_b_ndim != 1:
        out_shape.append(b_part_outshape)
    # (A B)^T = B^T A^T
    a, b = b, a
    ka = a._shape[ndim - 2]
    lda = n = a._shape[ndim - 1]
    m = b._shape[ndim - 2]
    ldb = kb = b._shape[ndim - 1]
    if ka != kb:
        raise ValueError(
            'shapes ({}) and ({}) not aligned'.format(
                ','.join([str(_) for _ in orig_a.shape]),
                ','.join([str(_) for _ in orig_b.shape])))
    if out is not None and out.shape != tuple(out_shape):
        raise ValueError('Output array has an invalid size')
    # Empty operands: the result is all zeros.
    if a.size == 0 or b.size == 0:
        if out is None:
            return cupy.zeros(out_shape, ret_dtype)
        else:
            out.fill(0)
            return out
    # Use `out` directly only when it is safe: right dtype, contiguous,
    # and not aliasing an operand.
    if (
        out is not None and out.dtype == dtype and out.flags.c_contiguous
        and not _memory_range.may_share_bounds(out, a)
        and not _memory_range.may_share_bounds(out, b)
    ):
        c = out
    else:
        c = core.ndarray(out_shape, dtype=dtype)
        if out is None:
            if dtype == ret_dtype:
                out = c
            else:
                out = core.ndarray(out_shape, dtype=ret_dtype)
    # Re-insert the unit axes dropped for 1-D operands so c_view has
    # full batch-matrix shape for the GEMM calls.
    if orig_a_ndim == 1 or orig_b_ndim == 1:
        c_view = c.view()
        if orig_b_ndim == 1:
            c_view._shape.push_back(1)
            c_view._strides.push_back(0)
        if orig_a_ndim == 1:
            c_view._shape.insert(c_view._shape.end() - 1, 1)
            c_view._strides.insert(c_view._strides.end() - 1, 0)
        assert c_view._c_contiguous
        c_view._update_f_contiguity()
    else:
        c_view = c
    # Non-float dtypes: custom integer GEMM kernels (strided variant
    # when no broadcasting was applied).
    if dtype.char not in 'efdFD':
        if not use_broadcast:
            _integral_tensordot_core_strided_batched(
                a, b, c_view, n, m, ka, dtype.char, batchCount)
        else:
            _integral_tensordot_core_batched(
                a, b, c_view, n, m, ka, dtype.char, batchCount)
        if out is not c:
            elementwise_copy(c, out)
        return out
    global _cuda_runtime_version
    if _cuda_runtime_version < 0:
        _cuda_runtime_version = runtime.runtimeGetVersion()
    cdef intptr_t handle = device.get_cublas_handle()
    cdef int cuda_dtype = to_cuda_dtype(dtype)
    cdef int algo = cublas.CUBLAS_GEMM_DEFAULT
    one = numpy.array(1, dtype=dtype)
    zero = numpy.array(0, dtype=dtype)
    if not use_broadcast:
        # Evenly strided batches: strided-batched GEMM.
        strideA = _get_stride_for_strided_batched_gemm(a)
        strideB = _get_stride_for_strided_batched_gemm(b)
        strideC = _get_stride_for_strided_batched_gemm(c_view)
        if dtype.char in 'fFdD':
            cublas.gemmStridedBatchedEx(
                handle,
                0,  # transa
                0,  # transb
                n, m, ka, one.ctypes.data,
                a.data.ptr, cuda_dtype, lda, strideA,
                b.data.ptr, cuda_dtype, ldb, strideB,
                zero.ctypes.data,
                c_view.data.ptr, cuda_dtype, ldc, strideC,
                batchCount, cuda_dtype, algo)
        else:
            raise TypeError(dtype, a.dtype, b.dtype)
    else:
        # Broadcast batches: pointer-array batched GEMM.
        ap = _mat_ptrs(a)
        bp = _mat_ptrs(b)
        cp = _mat_ptrs(c_view)
        if dtype == numpy.float32:
            cublas.sgemmBatched(
                handle,
                0,  # transa
                0,  # transb
                n, m, ka, one.ctypes.data,
                ap.data.ptr, lda,
                bp.data.ptr, ldb,
                zero.ctypes.data, cp.data.ptr, ldc, batchCount)
        elif dtype == numpy.float64:
            cublas.dgemmBatched(
                handle,
                0,  # transa
                0,  # transb
                n, m, ka, one.ctypes.data,
                ap.data.ptr, lda,
                bp.data.ptr, ldb,
                zero.ctypes.data, cp.data.ptr, ldc, batchCount)
        elif dtype == numpy.complex64:
            cublas.cgemmBatched(
                handle,
                0,  # transa
                0,  # transb
                n, m, ka, one.ctypes.data,
                ap.data.ptr, lda,
                bp.data.ptr, ldb,
                zero.ctypes.data, cp.data.ptr, ldc, batchCount)
        elif dtype == numpy.complex128:
            cublas.zgemmBatched(
                handle,
                0,  # transa
                0,  # transb
                n, m, ka, one.ctypes.data,
                ap.data.ptr, lda,
                bp.data.ptr, ldb,
                zero.ctypes.data, cp.data.ptr, ldc, batchCount)
        else:
            raise TypeError(dtype, a.dtype, b.dtype)
    if out is not c:
        elementwise_copy(c, out)
    return out
from cupy._core.core cimport _ndarray_base
cdef _ndarray_base _ndarray_all(_ndarray_base self, axis, out, keepdims)
cdef _ndarray_base _ndarray_any(_ndarray_base self, axis, out, keepdims)
cdef _ndarray_base _ndarray_greater(_ndarray_base self, other)
cdef _ndarray_base _ndarray_greater_equal(_ndarray_base self, other)
cdef _ndarray_base _ndarray_less(_ndarray_base self, other)
cdef _ndarray_base _ndarray_less_equal(_ndarray_base self, other)
cdef _ndarray_base _ndarray_equal(_ndarray_base self, other)
cdef _ndarray_base _ndarray_not_equal(_ndarray_base self, other)
from cupy._core._kernel import create_ufunc
from cupy._core._reduction import create_reduction_func
from cupy._core.core cimport _ndarray_base
cdef _ndarray_base _ndarray_all(_ndarray_base self, axis, out, keepdims):
    # ndarray.all(): delegates to the cupy_all reduction.
    return _all(self, axis=axis, out=out, keepdims=keepdims)
cdef _ndarray_base _ndarray_any(_ndarray_base self, axis, out, keepdims):
    # ndarray.any(): delegates to the cupy_any reduction.
    return _any(self, axis=axis, out=out, keepdims=keepdims)
cdef _ndarray_base _ndarray_greater(_ndarray_base self, other):
    # ndarray.__gt__(): delegates to the cupy_greater ufunc.
    return _greater(self, other)
cdef _ndarray_base _ndarray_greater_equal(_ndarray_base self, other):
    # ndarray.__ge__(): delegates to the cupy_greater_equal ufunc.
    return _greater_equal(self, other)
cdef _ndarray_base _ndarray_less(_ndarray_base self, other):
    # ndarray.__lt__(): delegates to the cupy_less ufunc.
    return _less(self, other)
cdef _ndarray_base _ndarray_less_equal(_ndarray_base self, other):
    # ndarray.__le__(): delegates to the cupy_less_equal ufunc.
    return _less_equal(self, other)
cdef _ndarray_base _ndarray_equal(_ndarray_base self, other):
    # ndarray.__eq__(): delegates to the cupy_equal ufunc.
    return _equal(self, other)
cdef _ndarray_base _ndarray_not_equal(_ndarray_base self, other):
    # ndarray.__ne__(): delegates to the cupy_not_equal ufunc.
    return _not_equal(self, other)
# Logical AND reduction over "element != 0"; identity is 'true'.
cdef _all = create_reduction_func(
    'cupy_all',
    ('?->?', 'B->?', 'h->?', 'H->?', 'i->?', 'I->?', 'l->?', 'L->?',
     'q->?', 'Q->?', 'e->?', 'f->?', 'd->?', 'F->?', 'D->?'),
    ('in0 != type_in0_raw(0)', 'a & b', 'out0 = a', 'bool'),
    'true', '')
# Logical OR reduction over "element != 0"; identity is 'false'.
cdef _any = create_reduction_func(
    'cupy_any',
    ('?->?', 'B->?', 'h->?', 'H->?', 'i->?', 'I->?', 'l->?', 'L->?',
     'q->?', 'Q->?', 'e->?', 'f->?', 'd->?', 'F->?', 'D->?'),
    ('in0 != type_in0_raw(0)', 'a | b', 'out0 = a', 'bool'),
    'false', '')
cpdef create_comparison(name, op, doc='', no_complex_dtype=True):
    """Create an elementwise comparison ufunc named ``cupy_<name>``.

    ``op`` is the C infix operator; complex input types are included
    only when ``no_complex_dtype`` is False.
    """
    ops = ('??->?', 'bb->?', 'BB->?', 'hh->?', 'HH->?', 'ii->?', 'II->?',
           'll->?', 'LL->?', 'qq->?', 'QQ->?', 'ee->?', 'ff->?', 'dd->?')
    if not no_complex_dtype:
        ops = ops + ('FF->?', 'DD->?')
    return create_ufunc(
        'cupy_' + name,
        ops,
        'out0 = in0 %s in1' % op,
        doc=doc)
# Comparison ufuncs backing the ndarray operators and the cupy
# namespace exports below.
# Fix: the not_equal docstring previously pointed at numpy.equal.
cdef _greater = create_comparison(
    'greater', '>',
    '''Tests elementwise if ``x1 > x2``.
    .. seealso:: :data:`numpy.greater`
    ''',
    no_complex_dtype=False)
cdef _greater_equal = create_comparison(
    'greater_equal', '>=',
    '''Tests elementwise if ``x1 >= x2``.
    .. seealso:: :data:`numpy.greater_equal`
    ''',
    no_complex_dtype=False)
cdef _less = create_comparison(
    'less', '<',
    '''Tests elementwise if ``x1 < x2``.
    .. seealso:: :data:`numpy.less`
    ''',
    no_complex_dtype=False)
cdef _less_equal = create_comparison(
    'less_equal', '<=',
    '''Tests elementwise if ``x1 <= x2``.
    .. seealso:: :data:`numpy.less_equal`
    ''',
    no_complex_dtype=False)
cdef _equal = create_comparison(
    'equal', '==',
    '''Tests elementwise if ``x1 == x2``.
    .. seealso:: :data:`numpy.equal`
    ''',
    no_complex_dtype=False)
cdef _not_equal = create_comparison(
    'not_equal', '!=',
    '''Tests elementwise if ``x1 != x2``.
    .. seealso:: :data:`numpy.not_equal`
    ''',
    no_complex_dtype=False)
# Variables to expose to Python
# (cythonized data cannot be exposed to Python, even with cpdef.)
# NOTE: `all` and `any` intentionally shadow the builtins in this
# module's namespace.
all = _all
any = _any
greater = _greater
greater_equal = _greater_equal
less = _less
less_equal = _less_equal
equal = _equal
not_equal = _not_equal
from libcpp cimport vector
from cupy._core._carray cimport shape_t
from cupy._core._carray cimport strides_t
from cupy._core.core cimport _ndarray_base
# --- Cython declarations (``.pxd``-style) for the manipulation routines ---
# These declare the C-level API of the shape-manipulation module; the
# implementations appear further below in this concatenated source.
cdef class broadcast:
    cdef:
        # Broadcasted views of the input arrays.
        readonly tuple values
        # The common broadcasted shape.
        readonly tuple shape
        # Total number of elements of ``shape``.
        readonly Py_ssize_t size
        # Number of dimensions of ``shape``.
        readonly Py_ssize_t nd

# ndarray member helpers (bound to ndarray methods elsewhere).
cdef _ndarray_shape_setter(_ndarray_base self, newshape)
cdef _ndarray_base _ndarray_reshape(_ndarray_base self, tuple shape, order)
cdef _ndarray_base _ndarray_transpose(_ndarray_base self, tuple axes)
cdef _ndarray_base _ndarray_swapaxes(
    _ndarray_base self, Py_ssize_t axis1, Py_ssize_t axis2)
cdef _ndarray_base _ndarray_flatten(_ndarray_base self, order)
cdef _ndarray_base _ndarray_ravel(_ndarray_base self, order)
cdef _ndarray_base _ndarray_squeeze(_ndarray_base self, axis)
cdef _ndarray_base _ndarray_repeat(_ndarray_base self, repeats, axis)

# Free functions exposed to other Cython modules.
cpdef _ndarray_base _expand_dims(_ndarray_base a, tuple axis)
cpdef _ndarray_base moveaxis(_ndarray_base a, source, destination)
cpdef _ndarray_base _move_single_axis(
    _ndarray_base a, Py_ssize_t source, Py_ssize_t destination)
cpdef _ndarray_base rollaxis(
    _ndarray_base a, Py_ssize_t axis, Py_ssize_t start=*)
cpdef _ndarray_base broadcast_to(_ndarray_base array, shape)
cpdef _ndarray_base _reshape(_ndarray_base self, const shape_t &shape_spec)
cpdef _ndarray_base _T(_ndarray_base self)
cpdef _ndarray_base _transpose(
    _ndarray_base self, const vector.vector[Py_ssize_t] &axes)
cpdef _ndarray_base _concatenate(
    list arrays, Py_ssize_t axis, tuple shape, _ndarray_base out, str casting)
cpdef _ndarray_base concatenate_method(
    tup, int axis, _ndarray_base out=*, dtype=*, casting=*)
# distutils: language = c++
import functools
import numpy
from cupy._core._kernel import ElementwiseKernel
from cupy._core._ufuncs import elementwise_copy
import cupy._core.core as core
cimport cpython # NOQA
cimport cython # NOQA
from libcpp cimport vector
from cupy._core._dtype cimport get_dtype, _raise_if_invalid_cast
from cupy._core cimport core
from cupy._core.core cimport _ndarray_base
from cupy._core cimport internal
from cupy._core._kernel cimport _check_peer_access, _preprocess_args
from cupy.cuda import device
@cython.final
cdef class broadcast:
    """Object that performs broadcasting.

    CuPy actually uses this class to support broadcasting in various
    operations. Note that this class does not provide an iterator.

    Args:
        arrays (tuple of arrays): Arrays to be broadcasted.

    Attributes:
        ~broadcast.shape (tuple of ints): The broadcasted shape.
        nd (int): Number of dimensions of the broadcasted shape.
        ~broadcast.size (int): Total size of the broadcasted shape.
        values (list of arrays): The broadcasted arrays.

    .. seealso:: :class:`numpy.broadcast`

    """

    def __init__(self, *arrays):
        cdef shape_t shape
        cdef list val = list(arrays)
        # Replaces each element of ``val`` with its broadcasted view and
        # writes the common shape into ``shape`` (raises on mismatch).
        internal._broadcast_core(val, shape)
        self.values = tuple(val)
        self.shape = tuple(shape)
        self.nd = <Py_ssize_t>shape.size()
        self.size = internal.prod(shape)
# _ndarray_base members

cdef _ndarray_shape_setter(_ndarray_base self, newshape):
    """In-place ``a.shape = ...`` setter: reshapes without copying.

    Raises AttributeError when the new shape would require a copy,
    matching NumPy's behavior for shape assignment.
    """
    cdef shape_t shape, strides
    if not cpython.PySequence_Check(newshape):
        newshape = (newshape,)
    # Resolve a single -1 entry from the total size.
    shape = internal.infer_unknown_dimension(newshape, self.size)
    _get_strides_for_nocopy_reshape(self, shape, strides)
    # An incomplete strides vector signals that a copy would be needed.
    if strides.size() != shape.size():
        raise AttributeError(
            'Incompatible shape for in-place modification. Use `.reshape()` '
            'to make a copy with the desired shape.')
    self._set_shape_and_strides(shape, strides, False, True)
cdef _ndarray_base _ndarray_reshape(_ndarray_base self, tuple shape, order):
    """Implementation of ``ndarray.reshape`` with 'C'/'F'/'A' order."""
    cdef int order_char = internal._normalize_order(order, False)

    # Allow ``reshape((a, b))`` in addition to ``reshape(a, b)``.
    if len(shape) == 1 and cpython.PySequence_Check(shape[0]):
        shape = tuple(shape[0])

    # 'A' resolves to 'F' only for arrays that are F- but not C-contiguous.
    if order_char == b'A':
        if self._f_contiguous and not self._c_contiguous:
            order_char = b'F'
        else:
            order_char = b'C'
    if order_char == b'C':
        return _reshape(self, shape)
    else:
        # TODO(grlee77): Support order within _reshape instead
        # The Fortran-ordered case is equivalent to:
        #     1.) reverse the axes via transpose
        #     2.) C-ordered reshape using reversed shape
        #     3.) reverse the axes via transpose
        return _T(_reshape(_T(self), shape[::-1]))
cdef _ndarray_base _ndarray_transpose(_ndarray_base self, tuple axes):
    """Implementation of ``ndarray.transpose(*axes)``.

    No arguments or a single ``None`` reverses the axes; a single
    sequence argument is unpacked into the axes tuple.
    """
    if len(axes) == 0:
        return _T(self)
    if len(axes) == 1:
        a = axes[0]
        if a is None:
            return _T(self)
        elif cpython.PySequence_Check(a):
            axes = tuple(a)
    return _transpose(self, axes)
cdef _ndarray_base _ndarray_swapaxes(
        _ndarray_base self, Py_ssize_t axis1, Py_ssize_t axis2):
    """Implementation of ``ndarray.swapaxes``: view with two axes exchanged."""
    cdef Py_ssize_t ndim = self.ndim
    cdef vector.vector[Py_ssize_t] axes
    if axis1 < -ndim or axis1 >= ndim or axis2 < -ndim or axis2 >= ndim:
        raise ValueError('Axis out of range')
    # Normalize negative axes.
    axis1 %= ndim
    axis2 %= ndim
    # Identity permutation with the two requested entries swapped.
    for i in range(ndim):
        axes.push_back(i)
    axes[axis1], axes[axis2] = axes[axis2], axes[axis1]
    return _transpose(self, axes)
cdef _ndarray_base _ndarray_flatten(_ndarray_base self, order):
    """Implementation of ``ndarray.flatten`` (always returns a copy).

    Supports 'C', 'F', 'A' and 'K' orders; non-C orders are reduced to the
    C case by transposing first.
    """
    cdef int order_char
    cdef vector.vector[Py_ssize_t] axes
    order_char = internal._normalize_order(order, True)
    if order_char == b'A':
        if self._f_contiguous and not self._c_contiguous:
            order_char = b'F'
        else:
            order_char = b'C'
    if order_char == b'C':
        return _ndarray_flatten_order_c(self)
    elif order_char == b'F':
        # F order == C order of the axis-reversed array.
        return _ndarray_flatten_order_c(_T(self))
    elif order_char == b'K':
        # 'K': follow the memory layout as closely as possible.
        axes = _npyiter_k_order_axes(self.strides)
        return _ndarray_flatten_order_c(_transpose(self, axes))
cdef _ndarray_base _ndarray_flatten_order_c(_ndarray_base self):
    """Copy ``self`` in C order and reshape the copy to 1-D in place."""
    newarray = self.copy(order='C')
    # A fresh C-contiguous copy can be flattened by rewriting shape/strides.
    newarray._shape.assign(<Py_ssize_t>1, self.size)
    newarray._strides.assign(<Py_ssize_t>1,
                             <Py_ssize_t>self.itemsize)
    newarray._c_contiguous = True
    newarray._f_contiguous = True
    return newarray
cdef vector.vector[Py_ssize_t] _npyiter_k_order_axes(strides_t& strides):
    # output transpose axes such that
    #   x.flatten(order="K") == x.transpose(axes).flatten(order="C")
    # by reproducing `npyiter_find_best_axis_ordering`
    # in numpy/core/src/multiarray/nditer_constr.c
    # Note that `flatten` and `ravel` should use this function for order="K",
    # while `copy(order="K")` should use `internal._get_strides_for_order_K`.
    # The result orders axes by decreasing |stride|; zero-stride axes are
    # "ambiguous" and keep their relative position as much as possible.
    cdef vector.vector[Py_ssize_t] axes
    cdef Py_ssize_t stride0, stride1
    cdef int ndim, i0, i1, ipos, k
    ndim = strides.size()
    # Insert axes one by one (from the last), placing each before the first
    # already-inserted axis whose |stride| does not exceed it.
    for i0 in reversed(range(ndim)):
        stride0 = abs(strides[i0])
        if stride0 == 0:  # ambiguous
            axes.insert(axes.begin(), i0)
            continue
        ipos = 0
        for k, i1 in enumerate(axes):
            stride1 = abs(strides[i1])
            if stride1 == 0:  # ambiguous
                continue
            elif stride1 <= stride0:  # shouldswap = false
                break
            else:  # shouldswap = true
                ipos = k + 1
        axes.insert(axes.begin() + ipos, i0)
    return axes
cdef _ndarray_base _ndarray_ravel(_ndarray_base self, order):
    """Implementation of ``ndarray.ravel``.

    Returns a view when possible (``_reshape`` avoids copying for
    compatible layouts); supports 'C', 'F', 'A' and 'K' orders.
    """
    cdef int order_char
    cdef shape_t shape
    cdef vector.vector[Py_ssize_t] axes
    shape.push_back(self.size)

    order_char = internal._normalize_order(order, True)
    if order_char == b'A':
        if self._f_contiguous and not self._c_contiguous:
            order_char = b'F'
        else:
            order_char = b'C'
    if order_char == b'C':
        return _reshape(self, shape)
    elif order_char == b'F':
        # F order == C order of the axis-reversed array.
        return _reshape(_T(self), shape)
    elif order_char == b'K':
        # 'K': follow the memory layout as closely as possible.
        axes = _npyiter_k_order_axes(self.strides)
        return _reshape(_transpose(self, axes), shape)
cdef _ndarray_base _ndarray_squeeze(_ndarray_base self, axis):
    """Implementation of ``ndarray.squeeze``: view with size-1 axes removed.

    ``axis`` may be None (all size-1 axes), an int, or a tuple of ints.
    Raises ValueError for a requested axis whose size is not one, or for a
    duplicated axis in a tuple.
    """
    cdef vector.vector[char] axis_flags
    cdef shape_t newshape
    cdef strides_t newstrides
    cdef Py_ssize_t ndim, naxes, _axis

    ndim = self._shape.size()
    axis_flags = vector.vector[char](ndim, 0)

    # Convert axis to boolean flag.
    if axis is None:
        for idim in range(ndim):
            if self._shape[idim] == 1:
                axis_flags[idim] = 1
    elif isinstance(axis, tuple):
        naxes = <Py_ssize_t>len(axis)
        for i in range(naxes):
            _axis = internal._normalize_axis_index(<Py_ssize_t>axis[i], ndim)
            if axis_flags[_axis] == 1:
                raise ValueError('duplicate value in \'axis\'')
            axis_flags[_axis] = 1
    else:
        _axis = <Py_ssize_t>axis
        if ndim == 0 and (_axis == 0 or _axis == -1):
            # Special case letting axis={-1,0} slip through for scalars,
            # for backwards compatibility reasons.
            pass
        else:
            _axis = internal._normalize_axis_index(_axis, ndim)
            axis_flags[_axis] = 1

    # Verify that the axes requested are all of size one
    any_ones = 0
    for idim in range(ndim):
        if axis_flags[idim] != 0:
            if self._shape[idim] == 1:
                any_ones = 1
            else:
                raise ValueError('cannot select an axis to squeeze out '
                                 'which has size not equal to one')

    # If there were no axes to squeeze out, return the same array
    if any_ones == 0:
        return self

    # Keep shape/strides only for the surviving axes.
    for i in range(ndim):
        if axis_flags[i] == 0:
            newshape.push_back(self._shape[i])
            newstrides.push_back(self._strides[i])

    v = self.view()
    # TODO(niboshi): Confirm update_x_contiguity flags
    v._set_shape_and_strides(newshape, newstrides, False, True)
    return v
cdef _ndarray_base _ndarray_repeat(_ndarray_base self, repeats, axis):
    """Thin method wrapper delegating ``ndarray.repeat`` to :func:`_repeat`."""
    return _repeat(self, repeats, axis)
# exposed

cpdef _ndarray_base _expand_dims(_ndarray_base a, tuple axis):
    """Insert length-1 axes at the positions in ``axis`` (cf. numpy.expand_dims)."""
    cdef vector.vector[Py_ssize_t] normalized_axis
    cdef out_ndim = a.ndim + len(axis)
    cdef shape_t a_shape = a.shape, out_shape
    # Axes are normalized against the *output* ndim; raises on duplicates.
    _normalize_axis_tuple(axis, out_ndim, normalized_axis)
    out_shape.assign(out_ndim, 0)
    cdef Py_ssize_t i, j
    # Mark the inserted positions with 1, then fill the remaining slots
    # with the original shape in order.
    for i in normalized_axis:
        out_shape[i] = 1
    j = 0
    for i in range(out_ndim):
        if out_shape[i] == 1:
            continue
        out_shape[i] = a_shape[j]
        j += 1
    return _reshape(a, out_shape)
cpdef _ndarray_base moveaxis(_ndarray_base a, source, destination):
    """Move axes of ``a`` to new positions (cf. numpy.moveaxis); returns a view."""
    cdef shape_t src, dest
    cdef Py_ssize_t ndim = a.ndim
    _normalize_axis_tuple(source, ndim, src)
    _normalize_axis_tuple(destination, ndim, dest)

    if src.size() != dest.size():
        raise ValueError('`source` and `destination` arguments must have '
                         'the same number of elements')

    cdef vector.vector[Py_ssize_t] order
    cdef Py_ssize_t i
    # Start with the non-moved axes in their original relative order ...
    for i in range(ndim):
        if not _has_element(src, i):
            order.push_back(i)

    cdef Py_ssize_t d, s
    # ... then insert each moved axis at its destination, in ascending
    # destination order so earlier insertions do not shift later ones.
    for d, s in sorted(zip(dest, src)):
        order.insert(order.begin() + d, s)

    return _transpose(a, order)
cpdef _ndarray_base _move_single_axis(
        _ndarray_base a, Py_ssize_t source, Py_ssize_t destination):
    """Like moveaxis, but supporting only integer source and destination."""
    cdef Py_ssize_t ndim = a.ndim
    source = internal._normalize_axis_index(source, ndim)
    destination = internal._normalize_axis_index(destination, ndim)

    # No-op fast path.
    if source == destination:
        return a

    cdef vector.vector[Py_ssize_t] order
    cdef Py_ssize_t i
    # All axes except ``source``, then ``source`` re-inserted at its target.
    for i in range(ndim):
        if i != source:
            order.push_back(i)
    order.insert(order.begin() + destination, source)

    return _transpose(a, order)
cpdef _ndarray_base rollaxis(
        _ndarray_base a, Py_ssize_t axis, Py_ssize_t start=0):
    """Roll ``axis`` until it lies at position ``start`` (cf. numpy.rollaxis)."""
    cdef Py_ssize_t i, ndim = a.ndim
    cdef vector.vector[Py_ssize_t] axes
    if axis < 0:
        axis += ndim
    if start < 0:
        start += ndim
    if not (0 <= axis < ndim and 0 <= start <= ndim):
        raise ValueError('Axis out of range')
    if axis < start:
        start -= 1
    if axis == start:
        return a
    if ndim == 2:
        # For 2-D, any non-trivial roll is a full transpose; passing the
        # empty ``axes`` makes _transpose take its axis-reversal fast path.
        return _transpose(a, axes)

    # General case: remove ``axis`` from the identity permutation and
    # re-insert it at ``start``.
    for i in range(ndim):
        axes.push_back(i)
    axes.erase(axes.begin() + axis)
    axes.insert(axes.begin() + start, axis)
    return _transpose(a, axes)
cpdef _ndarray_base _reshape(_ndarray_base self, const shape_t &shape_spec):
    """C-order reshape; returns a view when the layout allows, else a copy."""
    cdef shape_t shape
    cdef strides_t strides
    cdef _ndarray_base newarray
    # Resolve a single -1 entry from the total size.
    shape = internal.infer_unknown_dimension(shape_spec, self.size)
    if internal.vector_equal(shape, self._shape):
        return self.view()

    _get_strides_for_nocopy_reshape(self, shape, strides)
    if strides.size() == shape.size():
        # Compatible layout: reshape as a zero-copy view.
        return self._view(type(self), shape, strides, False, True, self)
    # Otherwise a contiguous copy is required first.
    newarray = self.copy()
    _get_strides_for_nocopy_reshape(newarray, shape, strides)

    if newarray.size != internal.prod(shape) or \
            strides.size() != shape.size():
        # Note: kept as in upstream -- a fresh C-contiguous copy always
        # admits a no-copy reshape, so this only guards internal errors.
        pass
    # TODO(niboshi): Confirm update_x_contiguity flags
    newarray._set_shape_and_strides(shape, strides, False, True)
    return newarray
cpdef _ndarray_base _T(_ndarray_base self):
    """Full axis reversal (``a.T``) as a view: shape and strides reversed."""
    ret = self.view()
    ret._shape.assign(self._shape.rbegin(), self._shape.rend())
    ret._strides.assign(self._strides.rbegin(), self._strides.rend())
    # Reversing all axes exactly swaps the contiguity flags.
    ret._c_contiguous = self._f_contiguous
    ret._f_contiguous = self._c_contiguous
    return ret
cpdef _ndarray_base _transpose(
        _ndarray_base self, const vector.vector[Py_ssize_t] &axes):
    """Permute axes per ``axes`` and return a view.

    An empty ``axes`` means full reversal.  Raises on out-of-range or
    repeated axes.  Identity and full-reversal permutations take fast paths.
    """
    cdef vector.vector[Py_ssize_t] a_axes
    cdef vector.vector[char] axis_flags
    cdef Py_ssize_t i, ndim, axis, axes_size
    cdef bint is_normal = True, is_trans = True

    axes_size = axes.size()
    if axes_size == 0:
        return _T(self)

    ndim = self._shape.size()
    if axes_size != ndim:
        raise ValueError("axes don't match array")

    axis_flags.resize(ndim, 0)
    for i in range(axes_size):
        axis = axes[i]
        if axis < -ndim or axis >= ndim:
            raise numpy.AxisError(axis, ndim)
        axis %= ndim
        a_axes.push_back(axis)
        if axis_flags[axis]:
            raise ValueError('repeated axis in transpose')
        axis_flags[axis] = 1
        # Track whether the permutation is the identity or the reversal.
        is_normal &= i == axis
        is_trans &= ndim - 1 - i == axis

    if is_normal:
        return self.view()
    if is_trans:
        return _T(self)

    ret = self.view()
    ret._shape.clear()
    ret._strides.clear()
    for axis in a_axes:
        ret._shape.push_back(self._shape[axis])
        ret._strides.push_back(self._strides[axis])
    ret._update_contiguity()
    return ret
cpdef array_split(_ndarray_base ary, indices_or_sections, Py_ssize_t axis):
    """Split ``ary`` into sub-views along ``axis`` (cf. numpy.array_split).

    ``indices_or_sections`` is either a section count (uneven splits
    allowed) or a sequence of split indices.  Returns a list of views.
    """
    cdef Py_ssize_t i, ndim, size, each_size, index, prev, stride
    cdef Py_ssize_t num_large
    cdef shape_t shape

    ndim = ary.ndim
    if -ndim > axis or ndim <= axis:
        raise IndexError('Axis exceeds ndim')
    if axis < 0:
        axis += ndim
    size = ary._shape[axis]

    if numpy.isscalar(indices_or_sections):
        # N sections: the first ``num_large`` pieces get one extra element.
        each_size = (size - 1) // indices_or_sections
        num_large = (size - 1) % indices_or_sections + 1
        indices = [i * each_size + min(i, num_large)
                   for i in range(1, indices_or_sections)]
    else:
        indices = [i if i >= 0 else size + i for i in indices_or_sections]

    if len(indices) == 0:
        return [ary]

    # Make a copy of shape for each view
    shape = ary._shape

    prev = 0
    ret = []
    stride = ary._strides[axis]
    if ary.size == 0:
        stride = 0
    for index in indices:
        index = min(index, size)
        # Empty slices (index <= prev) yield zero-length views.
        shape[axis] = max(index - prev, 0)
        v = ary.view()
        v.data = ary.data + prev * stride
        # TODO(niboshi): Confirm update_x_contiguity flags
        v._set_shape_and_strides(shape, ary._strides, True, True)
        ret.append(v)

        prev = index

    # Final piece: everything after the last split index.
    shape[axis] = size - prev
    v = ary.view()
    v.data = ary.data + prev * stride
    # TODO(niboshi): Confirm update_x_contiguity flags
    v._set_shape_and_strides(shape, ary._strides, True, True)
    ret.append(v)

    return ret
cpdef _ndarray_base broadcast_to(_ndarray_base array, shape):
    """Broadcast an array to a given shape.

    .. seealso::
        :func:`cupy.broadcast_to` for full documentation,
        :meth:`numpy.broadcast_to`

    """
    shape = tuple(shape) if numpy.iterable(shape) else (shape,)
    cdef int i, j, ndim = array._shape.size(), length = len(shape)
    cdef Py_ssize_t sh, a_sh
    if ndim > length:
        raise ValueError(
            'input operand has more dimensions than allowed by the axis '
            'remapping')
    cdef shape_t _shape = shape
    cdef strides_t strides
    # Default every stride to 0 (broadcasted axes repeat the same element);
    # real strides are filled in below for matching trailing dimensions.
    strides.assign(length, 0)
    for i in range(ndim):
        # Align the input's axes with the trailing axes of the target shape.
        j = i + length - ndim
        sh = _shape[j]
        a_sh = array._shape[i]
        if sh == a_sh:
            strides[j] = array._strides[i]
        elif a_sh != 1:
            raise ValueError(
                'operands could not be broadcast together with shape {} and '
                'requested shape {}'.format(array.shape, shape))

    view = array.view()
    # TODO(niboshi): Confirm update_x_contiguity flags
    view._set_shape_and_strides(_shape, strides, True, True)
    return view
cpdef _ndarray_base _repeat(_ndarray_base a, repeats, axis=None):
    """Repeat arrays along an axis.

    Args:
        a (cupy.ndarray): Array to transform.
        repeats (int, list or tuple): The number of repeats.
        axis (int): The axis to repeat.

    Returns:
        cupy.ndarray: Transformed array with repeats.

    Raises:
        ValueError: If ``repeats`` is a cupy.ndarray, is negative, or its
            length does not match ``a.shape[axis]``.

    .. seealso:: :func:`numpy.repeat`

    """
    cdef _ndarray_base ret

    if isinstance(repeats, _ndarray_base):
        # Fixed typo in the error message: 'ndaray' -> 'ndarray'.
        raise ValueError(
            'cupy.ndarray cannot be specified as `repeats` argument.')

    # Scalar and size 1 'repeat' arrays broadcast to any shape, for all
    # other inputs the dimension must match exactly.
    cdef bint broadcast = False
    # numpy.issubdtype(1, numpy.integer) fails with old numpy like 1.13.3.
    if (isinstance(repeats, int) or
            (hasattr(repeats, 'dtype') and
             numpy.issubdtype(repeats, numpy.integer))):
        if repeats < 0:
            raise ValueError(
                '\'repeats\' should not be negative: {}'.format(repeats))
        broadcast = True
        repeats = [repeats]
    elif cpython.PySequence_Check(repeats):
        for rep in repeats:
            if rep < 0:
                raise ValueError(
                    'all elements of \'repeats\' should not be negative: {}'
                    .format(repeats))
        if len(repeats) == 1:
            broadcast = True
    else:
        raise ValueError(
            '\'repeats\' should be int or sequence: {}'.format(repeats))

    if axis is None:
        if broadcast:
            # Fast path: repeat every element the same number of times by
            # broadcasting a (size, 1) view into a (size, repeats) output.
            a = _reshape(a, (-1, 1))
            ret = core.ndarray((a.size, repeats[0]), dtype=a.dtype)
            if ret.size:
                elementwise_copy(a, ret)
            return ret.ravel()
        else:
            # Per-element repeats: operate on the flattened array.
            a = a.ravel()
            axis = 0
    else:
        axis = internal._normalize_axis_index(axis, a.ndim)

    if broadcast:
        repeats = repeats * a._shape[axis]
    elif a.shape[axis] != len(repeats):
        raise ValueError(
            '\'repeats\' and \'axis\' of \'a\' should be same length: {} != {}'
            .format(a.shape[axis], len(repeats)))

    ret_shape = list(a.shape)
    ret_shape[axis] = sum(repeats)
    ret = core.ndarray(ret_shape, dtype=a.dtype)

    # Copy each source slab into its repeated span of the output.
    a_index = [slice(None)] * len(ret_shape)
    ret_index = list(a_index)
    offset = 0
    for i in range(a._shape[axis]):
        if repeats[i] == 0:
            continue
        a_index[axis] = slice(i, i + 1)
        ret_index[axis] = slice(offset, offset + repeats[i])
        # convert to tuple because cupy has a indexing bug
        ret[tuple(ret_index)] = a[tuple(a_index)]
        offset += repeats[i]
    return ret
cpdef _ndarray_base concatenate_method(
        tup, int axis, _ndarray_base out=None, dtype=None,
        casting='same_kind'):
    """Validate inputs for ``cupy.concatenate`` and dispatch to _concatenate.

    Checks non-emptiness, array types, matching ndim/shape, resolves the
    output dtype (promotion, or ``out``/``dtype`` -- which are mutually
    exclusive), and enforces the casting rule.
    """
    cdef int ndim0
    cdef int i
    cdef _ndarray_base a, a0
    if dtype is not None:
        dtype = get_dtype(dtype)

    dev_id = device.get_device_id()
    arrays = _preprocess_args(dev_id, tup, False)

    # Check if the input is not an empty sequence
    if len(arrays) == 0:
        raise ValueError('Cannot concatenate from empty tuple')

    # Check types of the input arrays
    for o in arrays:
        if not isinstance(o, _ndarray_base):
            raise TypeError('Only cupy arrays can be concatenated')

    # Check ndim > 0 for the input arrays
    for o in arrays:
        a = o
        if a._shape.size() == 0:
            raise TypeError('zero-dimensional arrays cannot be concatenated')

    # Check ndim consistency of the input arrays
    a0 = arrays[0]
    ndim0 = a0._shape.size()
    for o in arrays[1:]:
        a = o
        if a._shape.size() != ndim0:
            raise ValueError(
                'All arrays to concatenate must have the same ndim')

    # Check shape consistency of the input arrays, and compute the output
    # shape by accumulating sizes along the concatenation axis.
    shape0 = a0._shape
    axis = internal._normalize_axis_index(axis, ndim0)
    for o in arrays[1:]:
        a = o
        for i in range(ndim0):
            if i != axis and shape0[i] != a._shape[i]:
                raise ValueError(
                    'All arrays must have same shape except the axis to '
                    'concatenate')
        shape0[axis] += a._shape[axis]

    # Compute the output dtype
    if out is None:
        if dtype is None:
            dtype = a0.dtype
            have_same_types = True
            for o in arrays[1:]:
                have_same_types = have_same_types and (o.dtype == dtype)
            if not have_same_types:
                dtype = functools.reduce(
                    numpy.promote_types, set([a.dtype for a in arrays]))
    else:
        if dtype is not None:
            raise TypeError('concatenate() only takes `out` or `dtype` as an '
                            'argument, but both were provided.')
        dtype = out.dtype

    # Check casting rule
    for o in arrays:
        _raise_if_invalid_cast(o.dtype, dtype, casting)

    # Prepare the output array
    # NOTE(review): this local intentionally-or-not shadows the ``shape_t``
    # C type name; it is only used as a Python tuple here.
    shape_t = tuple(shape0)
    if out is None:
        out = core.ndarray(shape_t, dtype=dtype)
    else:
        if len(out.shape) != len(shape_t):
            raise ValueError('Output array has wrong dimensionality')
        if out.shape != shape_t:
            raise ValueError('Output array is the wrong shape')

    return _concatenate(arrays, axis, shape_t, out, casting)
cpdef _ndarray_base _concatenate(
        list arrays, Py_ssize_t axis, tuple shape, _ndarray_base out,
        str casting):
    """Copy ``arrays`` into ``out`` along ``axis``.

    For many small same-dtype inputs, a single gather kernel is used;
    otherwise each array is copied into its output slice separately.
    """
    cdef _ndarray_base a, b
    cdef Py_ssize_t i, aw, itemsize, axis_size
    cdef bint all_same_type, same_shape_and_contiguous
    # If arrays are large, Issuing each copy method is efficient.
    cdef Py_ssize_t threshold_size = 2 * 1024 * 1024

    dtype = out.dtype
    if len(arrays) > 8:
        all_same_type = True
        same_shape_and_contiguous = True
        axis_size = shape[axis] // len(arrays)
        total_bytes = 0
        itemsize = dtype.itemsize
        for a in arrays:
            if a.dtype != dtype:
                all_same_type = False
                break
            if same_shape_and_contiguous:
                same_shape_and_contiguous = (
                    a._c_contiguous and a._shape[axis] == axis_size)
            total_bytes += a.size * itemsize

        if all_same_type and total_bytes < threshold_size * len(arrays):
            return _concatenate_single_kernel(
                arrays, axis, shape, dtype, same_shape_and_contiguous, out)

    # Fallback: one elementwise copy per input into its output slice.
    i = 0
    slice_list = [slice(None)] * len(shape)
    for a in arrays:
        aw = a._shape[axis]
        slice_list[axis] = slice(i, i + aw)
        b = out[tuple(slice_list)]
        elementwise_copy(a, b, casting=casting)
        i += aw
    return out
cpdef Py_ssize_t size(_ndarray_base a, axis=None) except? -1:
    """Returns the number of elements along a given axis.

    Args:
        a (ndarray): Input data.
        axis (int or None): Axis along which the elements are counted.
            When it is ``None``, it returns the total number of elements.

    Returns:
        int: Number of elements along the given axis.

    Raises:
        IndexError: If ``axis`` is out of range.

    """
    cdef int index, ndim
    if axis is None:
        return a.size
    else:
        index = axis
        ndim = a._shape.size()
        if index < 0:
            index += ndim
        if not 0 <= index < ndim:
            raise IndexError('index out of range')
        return a._shape[index]
# private

cdef bint _has_element(const shape_t &source, Py_ssize_t n):
    """Linear-scan membership test: True iff ``n`` occurs in ``source``."""
    for i in range(source.size()):
        if source[i] == n:
            return True
    return False
cdef _get_strides_for_nocopy_reshape(
        _ndarray_base a, const shape_t &newshape, strides_t &newstrides):
    """Compute strides for reshaping ``a`` to ``newshape`` without a copy.

    On success ``newstrides`` has one entry per new dimension; when the
    layout is incompatible it is left *shorter* than ``newshape`` (cleared),
    which callers use as the "copy required" signal.
    """
    cdef Py_ssize_t size, itemsize, ndim, dim, last_stride
    size = a.size
    newstrides.clear()

    itemsize = a.itemsize
    if size == 1:
        # Any shape works for a single element; strides are arbitrary.
        newstrides.assign(<Py_ssize_t>newshape.size(), itemsize)
        return
    if size == 0:
        internal.get_contiguous_strides_inplace(
            newshape, newstrides, itemsize, True, False)
        return

    cdef shape_t shape
    cdef strides_t strides
    # Collapse contiguous runs of axes first to simplify matching.
    internal.get_reduced_dims(a._shape, a._strides, itemsize, shape, strides)

    ndim = shape.size()
    dim = 0
    # Walk the new shape, splitting each reduced dimension into factors.
    last_stride = shape[0] * strides[0]
    for i in range(newshape.size()):
        size = newshape[i]
        if size <= 1:
            newstrides.push_back(last_stride)
            continue
        if dim >= ndim or shape[dim] % size != 0:
            # Incompatible factorization: signal "copy needed".
            newstrides.clear()
            break
        shape[dim] //= size
        last_stride = shape[dim] * strides[dim]
        newstrides.push_back(last_stride)
        if shape[dim] == 1:
            dim += 1
cdef _normalize_axis_tuple(axis, Py_ssize_t ndim, shape_t &ret):
    """Normalizes an axis argument into a tuple of non-negative integer axes.

    Arguments `argname` and `allow_duplicate` are not supported.
    Results are appended to ``ret``; raises ValueError on repeats.
    """
    if numpy.isscalar(axis):
        axis = (axis,)

    for ax in axis:
        ax = internal._normalize_axis_index(ax, ndim)
        if _has_element(ret, ax):
            # the message in `numpy.core.numeric.normalize_axis_tuple`
            raise ValueError('repeated axis')
        ret.push_back(ax)
cdef _ndarray_base _concatenate_single_kernel(
        list arrays, Py_ssize_t axis, tuple shape, dtype,
        bint same_shape_and_contiguous, _ndarray_base out):
    """Concatenate many same-dtype arrays with one gather kernel launch.

    Device pointers, cumulative axis offsets, and per-array strides are
    packed into small device arrays consumed by the kernels below.
    """
    cdef _ndarray_base a, x
    cdef Py_ssize_t base, cum, ndim
    cdef int i, j
    cdef Py_ssize_t[:] ptrs
    cdef Py_ssize_t[:] cum_sizes
    cdef Py_ssize_t[:, :] x_strides
    cdef int device_id = device.get_device_id()

    assert out is not None

    # Table of raw device pointers, one per input array.
    ptrs = numpy.ndarray(len(arrays), numpy.int64)
    for i, a in enumerate(arrays):
        _check_peer_access(a, device_id)
        ptrs[i] = a.data.ptr
    x = core.array(ptrs)

    if same_shape_and_contiguous:
        # All inputs contribute equally sized contiguous chunks.
        base = internal.prod_sequence(shape[axis:]) // len(arrays)
        _concatenate_kernel_same_size(x, base, out)
        return out

    # General case: per-array strides plus cumulative sizes along ``axis``
    # (searched in the kernel to locate the source array of each element).
    ndim = len(shape)
    x_strides = numpy.ndarray((len(arrays), ndim), numpy.int64)
    cum_sizes = numpy.ndarray(len(arrays), numpy.int64)
    cum = 0
    for i, a in enumerate(arrays):
        for j in range(ndim):
            x_strides[i, j] = <int>a._strides[j]
        cum_sizes[i] = cum
        cum += <int>a._shape[axis]

    _concatenate_kernel(
        x, axis, core.array(cum_sizes), core.array(x_strides), out)
    return out
# Gather kernel for the fast path: every input array is C-contiguous and
# contributes ``base`` elements per outer index, so the source array and
# offset can be computed arithmetically from the flat output index ``i``.
cdef _concatenate_kernel_same_size = ElementwiseKernel(
    'raw P x, int64 base',
    'T y',
    '''
    ptrdiff_t middle = i / base;
    ptrdiff_t top = middle / x.size();
    ptrdiff_t array_ind = middle - top * x.size();
    ptrdiff_t offset = i + (top - middle) * base;
    y = reinterpret_cast<T*>(x[array_ind])[offset];
    ''',
    'cupy_concatenate_same_size'
)
# General gather kernel: binary-searches ``cum_sizes`` to find which input
# array owns the output coordinate along ``axis``, then walks that array's
# strides to locate the source element.  ``reduce_dims=False`` keeps the
# output indexer's dimensionality intact so ``_ind.get()`` is meaningful.
cdef _concatenate_kernel = ElementwiseKernel(
    '''raw P x, int32 axis, raw int64 cum_sizes, raw int64 x_strides''',
    'T y',
    '''
    ptrdiff_t axis_ind = _ind.get()[axis];
    ptrdiff_t left = 0;
    ptrdiff_t right = cum_sizes.size();

    while (left < right - 1) {
        ptrdiff_t m = (left + right) / 2;
        if (axis_ind < cum_sizes[m]) {
            right = m;
        } else {
            left = m;
        }
    }

    ptrdiff_t array_ind = left;
    axis_ind -= cum_sizes[left];
    char* ptr = reinterpret_cast<char*>(x[array_ind]);
    for (int j = _ind.ndim - 1; j >= 0; --j) {
        ptrdiff_t ind[] = {array_ind, j};
        ptrdiff_t offset;
        if (j == axis) {
            offset = axis_ind;
        } else {
            offset = _ind.get()[j];
        }
        ptr += x_strides[ind] * offset;
    }

    y = *reinterpret_cast<T*>(ptr);
    ''',
    'cupy_concatenate',
    reduce_dims=False
)
from cupy._core.core cimport _ndarray_base
# --- Cython declarations (``.pxd``-style) for the math routines below ---
cdef _ndarray_base _ndarray_conj(_ndarray_base self)
cdef _ndarray_base _ndarray_real_getter(_ndarray_base self)
cdef _ndarray_base _ndarray_real_setter(_ndarray_base self, value)
cdef _ndarray_base _ndarray_imag_getter(_ndarray_base self)
cdef _ndarray_base _ndarray_imag_setter(_ndarray_base self, value)
cdef _ndarray_base _ndarray_prod(
    _ndarray_base self, axis, dtype, out, keepdims)
cdef _ndarray_base _ndarray_sum(_ndarray_base self, axis, dtype, out, keepdims)
cdef _ndarray_base _ndarray_cumsum(_ndarray_base self, axis, dtype, out)
cdef _ndarray_base _ndarray_cumprod(_ndarray_base self, axis, dtype, out)
cdef _ndarray_base _ndarray_clip(_ndarray_base self, a_min, a_max, out)

cpdef _ndarray_base _nansum(_ndarray_base a, axis, dtype, out, keepdims)
cpdef _ndarray_base _nanprod(_ndarray_base a, axis, dtype, out, keepdims)

# Operation selector for the inclusive-scan (prefix sum/prod) kernels.
cpdef enum scan_op:
    SCAN_SUM = 0
    SCAN_PROD = 1

cdef _ndarray_base scan(_ndarray_base a, op, dtype=*, _ndarray_base out=*,
                        incomplete=*, chunk_size=*)

# Ufunc objects defined at module level in the implementation.
cdef object _sum_auto_dtype
cdef object _add
cdef object _conj
cdef object _angle
cdef object _positive
cdef object _negative
cdef object _multiply
cdef object _divide
cdef object _power
cdef object _subtract
cdef object _true_divide
cdef object _floor_divide
cdef object _remainder
cdef object _absolute
cdef object _sqrt
import string
import numpy
import cupy
from cupy._core._reduction import create_reduction_func
from cupy._core._kernel import create_ufunc, _get_warpsize
from cupy._core._scalar import get_typename
from cupy._core._ufuncs import elementwise_copy
import cupy._core.core as core
from cupy._core cimport internal
from cupy import _util
from cupy_backends.cuda.api cimport runtime
from cupy._core cimport _accelerator
from cupy._core._dtype cimport get_dtype
from cupy._core.core cimport _ndarray_init
from cupy._core.core cimport compile_with_cache
from cupy._core.core cimport _ndarray_base
from cupy.cuda cimport memory
from cupy.cuda import cub
# Optional cuTENSOR backend: ``None`` when the extension is not installed,
# checked before routing reductions through the cuTENSOR accelerator.
try:
    import cupy_backends.cuda.libs.cutensor as cuda_cutensor
except ImportError:
    cuda_cutensor = None
# _ndarray_base members

cdef _ndarray_base _ndarray_conj(_ndarray_base self):
    """Complex conjugate; non-complex arrays are returned unchanged (no copy)."""
    if self.dtype.kind == 'c':
        return _conjugate(self)
    else:
        return self
cdef _ndarray_base _ndarray_real_getter(_ndarray_base self):
    """``a.real``: for complex arrays, a float view over the real parts.

    The view shares memory (same data pointer, same strides, half-width
    dtype); real arrays are returned as-is.
    """
    if self.dtype.kind == 'c':
        # e.g. complex64 ('F') -> float32 ('f').
        dtype = get_dtype(self.dtype.char.lower())
        view = core.ndarray.__new__(
            type(self), shape=self._shape, dtype=dtype, _obj=self,
            memptr=self.data, strides=self._strides)
        # Keep the view rooted at the base-most array.
        (<_ndarray_base>view).base = (
            self.base if self.base is not None else self)
        return view
    return self
cdef _ndarray_base _ndarray_real_setter(_ndarray_base self, value):
    """``a.real = value``: elementwise copy into the real-part view."""
    elementwise_copy(value, _ndarray_real_getter(self))
cdef _ndarray_base _ndarray_imag_getter(_ndarray_base self):
    """``a.imag``: imaginary-part view for complex arrays, zeros otherwise."""
    cdef memory.MemoryPointer memptr
    if self.dtype.kind == 'c':
        dtype = get_dtype(self.dtype.char.lower())
        memptr = self.data
        # Make the memory pointer point to the first imaginary element.
        # Note that even if the array doesn't have a valid memory (e.g. 0-size
        # array), the resulting array should be a view of the original array,
        # aligning with NumPy behavior.
        if memptr.ptr != 0:
            memptr = memptr + self.dtype.itemsize // 2
        view = core.ndarray.__new__(
            type(self), shape=self._shape, dtype=dtype, memptr=memptr,
            strides=self._strides)
        (<_ndarray_base>view).base = (
            self.base if self.base is not None else self)
        return view
    # Real input: NumPy returns a zero-filled array (not a view).
    new_array = core.ndarray.__new__(type(self), self.shape, dtype=self.dtype)
    new_array.fill(0)
    return new_array
cdef _ndarray_base _ndarray_imag_setter(_ndarray_base self, value):
    """``a.imag = value``: only valid for complex arrays (matches NumPy)."""
    if self.dtype.kind == 'c':
        elementwise_copy(value, _ndarray_imag_getter(self))
    else:
        raise TypeError('cupy.ndarray does not have imaginary part to set')
cdef _ndarray_base _ndarray_prod(
        _ndarray_base self, axis, dtype, out, keepdims):
    """``ndarray.prod``: try configured accelerators, then the fallback ufunc."""
    for accelerator in _accelerator._routine_accelerators:
        result = None
        if accelerator == _accelerator.ACCELERATOR_CUB:
            # result will be None if the reduction is not compatible with CUB
            result = cub.cub_reduction(
                self, cub.CUPY_CUB_PROD, axis, dtype, out, keepdims)
        if (accelerator == _accelerator.ACCELERATOR_CUTENSOR and
                cuda_cutensor is not None):
            from cupyx import cutensor
            result = cutensor._try_reduction_routine(
                self, axis, dtype, out, keepdims, cuda_cutensor.OP_MUL, 1, 0)
        if result is not None:
            return result
    # Fallback: auto-promoting reduction when no dtype was requested.
    if dtype is None:
        return _prod_auto_dtype(self, axis, dtype, out, keepdims)
    else:
        return _prod_keep_dtype(self, axis, dtype, out, keepdims)
cdef _ndarray_base _ndarray_sum(
        _ndarray_base self, axis, dtype, out, keepdims):
    """``ndarray.sum``: try configured accelerators, then the fallback ufunc."""
    for accelerator in _accelerator._routine_accelerators:
        result = None
        if accelerator == _accelerator.ACCELERATOR_CUB:
            # result will be None if the reduction is not compatible with CUB
            result = cub.cub_reduction(
                self, cub.CUPY_CUB_SUM, axis, dtype, out, keepdims)
        if (accelerator == _accelerator.ACCELERATOR_CUTENSOR and
                cuda_cutensor is not None):
            from cupyx import cutensor
            result = cutensor._try_reduction_routine(
                self, axis, dtype, out, keepdims, cuda_cutensor.OP_ADD, 1, 0)
        if result is not None:
            return result
    # Fallback: auto-promoting reduction when no dtype was requested.
    if dtype is None:
        return _sum_auto_dtype(self, axis, dtype, out, keepdims)
    else:
        return _sum_keep_dtype(self, axis, dtype, out, keepdims)
cdef _ndarray_base _ndarray_cumsum(_ndarray_base self, axis, dtype, out):
    """Thin method wrapper delegating ``ndarray.cumsum`` to cupy.cumsum."""
    return cupy.cumsum(self, axis, dtype, out)
cdef _ndarray_base _ndarray_cumprod(_ndarray_base self, axis, dtype, out):
    """Thin method wrapper delegating ``ndarray.cumprod`` to cupy.cumprod."""
    return cupy.cumprod(self, axis, dtype, out)
cdef _ndarray_base _ndarray_clip(_ndarray_base self, a_min, a_max, out):
    """``ndarray.clip``: fill a missing bound with the dtype's extreme value."""
    if a_min is None and a_max is None:
        raise ValueError('array_clip: must set either max or min')
    kind = self.dtype.kind
    # A None bound is replaced by -inf/inf (floats) or the integer
    # min/max so the single _clip ufunc can always take both bounds.
    if a_min is None:
        if kind == 'f':
            a_min = self.dtype.type('-inf')
        elif kind in 'iu':
            a_min = numpy.iinfo(self.dtype.type).min
    if a_max is None:
        if kind == 'f':
            a_max = self.dtype.type('inf')
        elif kind in 'iu':
            a_max = numpy.iinfo(self.dtype.type).max
    return _clip(self, a_min, a_max, out=out)
# private/internal

# Per-op C operator and identity element used when rendering the scan and
# block-sum kernel templates below.
_op_char = {scan_op.SCAN_SUM: '+', scan_op.SCAN_PROD: '*'}
_identity = {scan_op.SCAN_SUM: 0, scan_op.SCAN_PROD: 1}
@cupy._util.memoize(for_each_device=True)
def _cupy_bsum_shfl(op, chunk_size, warp_size=32):
    """Returns a kernel that computes the sum/prod of each thread-block.

    Args:
        op (int): Operation type. SCAN_SUM or SCAN_PROD.
        chunk_size (int): Number of array elements processed by a single
            thread-block.
        warp_size (int): Warp size.

    Returns:
        cupy.ElementwiseKernel

    Example:
       a = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
       _cupy_bsum(op=SCAN_SUM, chunk_size=4)(a, b, ...)
       b == [10, 26, 19]

    Note:
       This uses warp shuffle functions to exchange data in a warp.
       See the link below for details about warp shuffle functions.
       https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#warp-shuffle-functions
    """
    block_size = chunk_size // 2  # each thread handles two elements
    in_params = 'raw T a'
    out_params = 'raw O b'
    # Per-block shared buffer holding one partial result per warp.
    loop_prep = string.Template("""
        __shared__ O smem[${block_size} / ${warp_size}];
        const int n_warp = ${block_size} / ${warp_size};
        const int warp_id = threadIdx.x / ${warp_size};
        const int lane_id = threadIdx.x % ${warp_size};
    """).substitute(block_size=block_size, warp_size=warp_size)
    # Two-stage reduction: butterfly-shuffle within each warp, then the
    # first warp reduces the per-warp partials and lane 0 writes the result.
    loop_body = string.Template("""
        O x = ${identity};
        if (2*i < a.size()) x = a[2*i];
        if (2*i + 1 < a.size()) x ${op}= a[2*i + 1];
        for (int j = 1; j < ${warp_size}; j *= 2) {
            x ${op}= __shfl_xor_sync(0xffffffff, x, j, ${warp_size});
        }
        if (lane_id == 0) smem[warp_id] = x;
        __syncthreads();
        if (warp_id == 0) {
            x = ${identity};
            if (lane_id < n_warp) x = smem[lane_id];
            for (int j = 1; j < n_warp; j *= 2) {
                x ${op}= __shfl_xor_sync(0xffffffff, x, j, ${warp_size});
            }
            int block_id = i / ${block_size};
            if (lane_id == 0) b[block_id] = x;
        }
    """).substitute(block_size=block_size, warp_size=warp_size,
                    op=_op_char[op], identity=_identity[op])
    return cupy.ElementwiseKernel(in_params, out_params, loop_body,
                                  'cupy_bsum_shfl', loop_prep=loop_prep)
@cupy._util.memoize(for_each_device=True)
def _cupy_bsum_smem(op, chunk_size, warp_size=32):
    """Returns a kernel that computes the sum/prod of each thread-block.

    Args:
        op (int): Operation type. SCAN_SUM or SCAN_PROD.
        chunk_size (int): Number of array elements processed by a single
            thread-block.
        warp_size (int): Warp size.

    Returns:
        cupy.ElementwiseKernel

    Example:
        a = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
        _cupy_bsum_smem(op=SCAN_SUM, chunk_size=4)(a, b, ...)
        b == [10, 26, 19]

    Note:
        This uses shared memory to exchange data in a warp.
    """
    block_size = chunk_size // 2  # each thread handles two elements
    in_params = 'raw T a'
    out_params = 'raw O b'
    # smem1 replaces the warp-shuffle exchange of the _shfl variant;
    # smem2 holds the per-warp partial results.
    loop_prep = string.Template("""
        __shared__ O smem1[${block_size}];
        __shared__ O smem2[${warp_size}];
        const int n_warp = ${block_size} / ${warp_size};
        const int warp_id = threadIdx.x / ${warp_size};
        const int lane_id = threadIdx.x % ${warp_size};
    """).substitute(block_size=block_size, warp_size=warp_size)
    # Same butterfly (XOR) reduction pattern as the shuffle variant, but
    # lanes communicate through shared memory; usable for types that the
    # platform's __shfl* intrinsics do not support.
    loop_body = string.Template("""
        O x = ${identity};
        if (2*i < a.size()) x = a[2*i];
        if (2*i + 1 < a.size()) x ${op}= a[2*i + 1];
        for (int j = 1; j < ${warp_size}; j *= 2) {
            smem1[threadIdx.x] = x;  __syncwarp();
            x ${op}= smem1[threadIdx.x ^ j];  __syncwarp();
        }
        if (lane_id == 0) smem2[warp_id] = x;
        __syncthreads();
        if (warp_id == 0) {
            x = ${identity};
            if (lane_id < n_warp) x = smem2[lane_id];
            for (int j = 1; j < n_warp; j *= 2) {
                smem2[lane_id] = x;  __syncwarp();
                x ${op}= smem2[lane_id ^ j];  __syncwarp();
            }
            int block_id = i / ${block_size};
            if (lane_id == 0) b[block_id] = x;
        }
    """).substitute(block_size=block_size, warp_size=warp_size,
                    op=_op_char[op], identity=_identity[op])
    return cupy.ElementwiseKernel(in_params, out_params, loop_body,
                                  'cupy_bsum_smem', loop_prep=loop_prep)
@cupy._util.memoize(for_each_device=True)
def _cupy_scan_naive(op, chunk_size, warp_size=32):
    """Returns a kernel to compute an inclusive scan.

    It first performs an inclusive scan in each thread-block and then add the
    scan results for the sum/prod of the chunks.

    Args:
        op (int): Operation type. SCAN_SUM or SCAN_PROD.
        chunk_size (int): Number of array elements processed by a single
            thread-block.
        warp_size (int): Warp size.

    Returns:
        cupy.ElementwiseKernel

    Example:
        b = [10, 36, 55]
        a = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
        _cupy_scan_naive(op=SCAN_SUM, chunk_size=4)(b, a, out, ...)
        out == [1, 3, 6, 10, 15, 21, 28, 36, 45, 55]

    Note:
        This uses a kind of method called "Naive Parallel Scan" for inclusive
        scan in each thread-block. See below for details about it.
        https://developer.nvidia.com/gpugems/gpugems3/part-vi-gpu-computing/chapter-39-parallel-prefix-sum-scan-cuda
    """
    # Note: ``b`` carries the already-scanned per-chunk totals produced by
    # the bsum kernels (see ``scan`` below); one thread handles one element.
    in_params = 'raw O b'
    out_params = 'raw T a, raw O out'
    loop_prep = string.Template("""
        __shared__ O smem1[${block_size}];
        __shared__ O smem2[${warp_size}];
        const int n_warp = ${block_size} / ${warp_size};
        const int warp_id = threadIdx.x / ${warp_size};
        const int lane_id = threadIdx.x % ${warp_size};
    """).substitute(block_size=chunk_size, warp_size=warp_size)
    # Three phases: (1) naive scan within each warp via smem1,
    # (2) warp 0 scans the per-warp totals in smem2,
    # (3) every element folds in its preceding warps' total and the
    # preceding blocks' total from ``b``.
    loop_body = string.Template("""
        O x = ${identity};
        if (i < a.size()) x = a[i];
        for (int j = 1; j < ${warp_size}; j *= 2) {
            smem1[threadIdx.x] = x;  __syncwarp();
            if (lane_id - j >= 0) x ${op}= smem1[threadIdx.x - j];
            __syncwarp();
        }
        if (lane_id == ${warp_size} - 1) smem2[warp_id] = x;
        __syncthreads();
        if (warp_id == 0) {
            O y = ${identity};
            if (lane_id < n_warp) y = smem2[lane_id];
            for (int j = 1; j < n_warp; j *= 2) {
                smem2[lane_id] = y;  __syncwarp();
                if (lane_id - j >= 0) y ${op}= smem2[lane_id - j];
                __syncwarp();
            }
            smem2[lane_id] = y;
        }
        __syncthreads();
        if (warp_id > 0) x ${op}= smem2[warp_id - 1];
        int block_id = i / ${block_size};
        if (block_id > 0) x ${op}= b[block_id - 1];
        if (i < a.size()) out[i] = x;
    """).substitute(block_size=chunk_size, warp_size=warp_size,
                    op=_op_char[op], identity=_identity[op])
    return cupy.ElementwiseKernel(in_params, out_params, loop_body,
                                  'cupy_scan_naive', loop_prep=loop_prep)
@cupy._util.memoize(for_each_device=True)
def _cupy_scan_btree(op, chunk_size, warp_size=32):
    """Returns a kernel to compute an inclusive scan.

    It first performs an inclusive scan in each thread-block and then add the
    scan results for the sum/prod of the chunks.

    Args:
        op (int): Operation type. SCAN_SUM or SCAN_PROD.
        chunk_size (int): Number of array elements processed by a single
            thread-block.
        warp_size (int): Warp size.

    Returns:
        cupy.ElementwiseKernel

    Example:
        b = [10, 36, 55]
        a = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
        _cupy_scan_btree(op=SCAN_SUM, chunk_size=4)(b, a, out, ...)
        out == [1, 3, 6, 10, 15, 21, 28, 36, 45, 55]

    Note:
        This uses a kind of method called "Work-Efficient Parallel Scan" for
        inclusive scan in each thread-block. See below link for details about
        "Work-Efficient Parallel Scan".
        https://developer.nvidia.com/gpugems/gpugems3/part-vi-gpu-computing/chapter-39-parallel-prefix-sum-scan-cuda
    """
    in_params = 'raw O b'
    out_params = 'raw T a, raw O out'
    # smem0 has one extra leading slot holding the identity so that
    # ``smem0[threadIdx.x]`` in the down-sweep reads an exclusive prefix;
    # smem1 aliases smem0 + 1 to give a plain 0-based view.
    loop_prep = string.Template("""
        __shared__ O smem0[${block_size} + 1];
        O *smem1 = smem0 + 1;
        __shared__ O smem2[${warp_size}];
        const int n_warp = ${block_size} / ${warp_size};
        const int warp_id = threadIdx.x / ${warp_size};
        const int lane_id = threadIdx.x % ${warp_size};
        if (threadIdx.x == 0) smem0[0] = ${identity};
    """).substitute(block_size=chunk_size, warp_size=warp_size,
                    identity=_identity[op])
    # Up-sweep within each warp, then warp 0 combines the per-warp totals
    # (up-sweep + down-sweep on smem2), then a block-wide down-sweep
    # distributes the prefixes; finally each element folds in the
    # preceding blocks' total from ``b``.
    loop_body = string.Template("""
        O x = ${identity};
        if (i < a.size()) x = a[i];
        for (int j = 1; j < ${warp_size}; j *= 2) {
            smem1[threadIdx.x] = x;  __syncwarp();
            if (lane_id % (2*j) == (2*j)-1) {
                x ${op}= smem1[threadIdx.x - j];
            }
            __syncwarp();
        }
        smem1[threadIdx.x] = x;
        __syncthreads();
        if (warp_id == 0) {
            O y = ${identity};
            if (lane_id < n_warp) {
                y = smem0[${warp_size} * (lane_id + 1)];
            }
            for (int j = 1; j < n_warp; j *= 2) {
                smem2[lane_id] = y;  __syncwarp();
                if (lane_id % (2*j) == (2*j)-1) {
                    y ${op}= smem2[lane_id - j];
                }
                __syncwarp();
            }
            for (int j = n_warp / 4; j > 0; j /= 2) {
                smem2[lane_id] = y;  __syncwarp();
                if ((lane_id % (2*j) == j-1) && (lane_id >= 2*j)) {
                    y ${op}= smem2[lane_id - j];
                }
                __syncwarp();
            }
            if (lane_id < n_warp) {
                smem0[${warp_size} * (lane_id + 1)] = y;
            }
        }
        __syncthreads();
        x = smem0[threadIdx.x];
        for (int j = ${warp_size} / 2; j > 0; j /= 2) {
            if (lane_id % (2*j) == j) {
                x ${op}= smem0[threadIdx.x - j];
            }
            __syncwarp();
            smem0[threadIdx.x] = x;  __syncwarp();
        }
        __syncthreads();
        x = smem1[threadIdx.x];
        int block_id = i / ${block_size};
        if (block_id > 0) x ${op}= b[block_id - 1];
        if (i < a.size()) out[i] = x;
    """).substitute(block_size=chunk_size, warp_size=warp_size,
                    op=_op_char[op], identity=_identity[op])
    return cupy.ElementwiseKernel(in_params, out_params, loop_body,
                                  'cupy_scan_btree', loop_prep=loop_prep)
cdef _ndarray_base scan(
        _ndarray_base a, op, dtype=None, _ndarray_base out=None,
        incomplete=False, chunk_size=512):
    """Return the prefix sum(scan) of the elements.

    Args:
        a (cupy.ndarray): input array. Must be 1-D.
        op: Scan operation (``scan_op.SCAN_SUM`` or ``scan_op.SCAN_PROD``).
        dtype: Output dtype. Ignored when ``out`` is given.
        out (cupy.ndarray): Alternative output array in which to place
            the result. The same size and same type as the input array(a).
        incomplete (bool): When True, only the scanned per-chunk totals are
            returned instead of the full result.
        chunk_size (int): Number of elements handled per thread-block.

    Returns:
        cupy.ndarray: A new array holding the result is returned.
    """
    if a._shape.size() != 1:
        raise TypeError('Input array should be 1D array.')
    if out is None:
        if dtype is None:
            dtype = a.dtype
        if not incomplete:
            out = _ndarray_init(cupy.ndarray, a._shape, dtype, None)
    else:
        if a.size != out.size:
            raise ValueError('Provided out is the wrong size')
        # ``out`` dictates the accumulation dtype.
        dtype = out.dtype
    dtype = numpy.dtype(dtype)
    warp_size = _get_warpsize()
    # Pick the block-sum kernel: the shuffle variant is only usable for
    # dtypes that the platform's __shfl* intrinsics accept.
    if runtime._is_hip_environment:
        if dtype.char in 'iIfdlq':
            # On HIP, __shfl* supports int, unsigned int, float, double,
            # long, and long long. The documentation is too outdated and
            # unreliable; refer to the header at
            # $ROCM_HOME/include/hip/hcc_detail/device_functions.h
            bsum_kernel = _cupy_bsum_shfl(op, chunk_size, warp_size)
        else:
            bsum_kernel = _cupy_bsum_smem(op, chunk_size, warp_size)
    else:
        if dtype.char in 'iIlLqQfd':
            bsum_kernel = _cupy_bsum_shfl(op, chunk_size, warp_size)
        else:
            bsum_kernel = _cupy_bsum_smem(op, chunk_size, warp_size)
    # The work-efficient (btree) scan is used for floating/complex dtypes;
    # the naive scan for the rest.
    if dtype.char in 'fdFD':
        scan_kernel = _cupy_scan_btree(op, chunk_size, warp_size)
    else:
        scan_kernel = _cupy_scan_naive(op, chunk_size, warp_size)
    # b holds one total per chunk; it is scanned recursively so that
    # b[k-1] is the combined total of all chunks before chunk k.
    b_size = (a.size + chunk_size - 1) // chunk_size
    b = cupy.empty((b_size,), dtype=dtype)
    size = b.size * chunk_size
    if a.size > chunk_size:
        bsum_kernel(a, b, size=size // 2, block_size=chunk_size // 2)
        scan(b, op, dtype=dtype, out=b)
        if incomplete:
            return b
        scan_kernel(b, a, out, size=size, block_size=chunk_size)
    else:
        # Single chunk: b is never read by the kernel (block_id is 0).
        scan_kernel(b, a, out, size=size, block_size=chunk_size)
    return out
@_util.memoize(for_each_device=True)
def _inclusive_batch_scan_kernel(
        dtype, block_size, op, src_c_cont, out_c_cont):
    """return Prefix Sum(Scan) cuda kernel
    for a 2d array over axis 1
    used for scanning over different axes

    e.g
    if blocksize > len(src[0])
    src [[1, 2, 3, 4],
         [5, 6, 7, 8]]
    dst [[1, 3, 6, 10],
         [5, 11, 18, 26]]

    if blocksize < len(src[0])
    block_size: 2
    # TODO show partialness
    src [[1, 2, 3, 4],
         [5, 6, 7, 8]]
    dst [[1, 3, 3, 7],
         [5, 11, 7, 15]]

    Args:
        dtype: src, dst array type
        block_size: block_size

    Returns:
        cupy.cuda.Function: cuda function
    """
    op_char = {scan_op.SCAN_SUM: '+', scan_op.SCAN_PROD: '*'}
    identity = {scan_op.SCAN_SUM: 0, scan_op.SCAN_PROD: 1}
    name = 'cupy_inclusive_batch_scan_kernel'
    dtype = get_typename(dtype)
    # Each batch (row segment) is padded to a power of two (or to a
    # multiple of the block size) with the identity so the in-shared-memory
    # up-sweep/down-sweep below works on a regular tree; ``must_copy``
    # masks out the padding lanes on load and store.
    source = string.Template("""
    extern "C" __global__ void ${name}(
            const CArray<${dtype}, 2, ${src_c_cont}> src,
            CArray<${dtype}, 2, ${out_c_cont}> dst, int batch_size) {
        long long n = src.size();
        extern __shared__ ${dtype} temp[];
        unsigned int thid = threadIdx.x;
        unsigned int block = blockIdx.x * blockDim.x;

        unsigned int pad_batch_size = batch_size;
        bool must_copy = true;
        if (batch_size & (batch_size - 1)) {
            // Round the batch size up to the next power of two.
            pad_batch_size = 1 << (32 - __clz(batch_size));
            must_copy = (thid & (pad_batch_size - 1)) < batch_size;
        }
        if (pad_batch_size > ${block_size}) {
            // A batch spans several blocks; pad to a block multiple.
            int blocks_per_batch = (batch_size - 1) / ${block_size} + 1;
            pad_batch_size = ${block_size} * blocks_per_batch;
            // Must copy enables for all blocks but the last one in the batch
            bool last_block = (blockIdx.x + 1) % blocks_per_batch == 0;
            int remaining_batch = batch_size % ${block_size};
            if (remaining_batch == 0) {
                remaining_batch = ${block_size};
            }
            must_copy = !last_block || (thid < (remaining_batch));
        }

        int pad_per_batch = pad_batch_size - batch_size;
        int n_batches_block = ${block_size} / pad_batch_size;
        unsigned int idx0 = thid + block;
        // Map the padded linear index back to (row, col) in src/dst.
        int batch_id = idx0 / pad_batch_size;
        idx0 = idx0 - pad_per_batch * batch_id;
        int row = idx0 / batch_size;
        int col = idx0 % batch_size;
        const ptrdiff_t idx0_idx[] = {row, col};
        if (idx0 < n) {
            temp[thid] = (must_copy) ? src[idx0_idx] : (${dtype}) ${identity};
            __syncthreads();
            if (!n_batches_block) {
                // Batch larger than the block: scan the whole block.
                n_batches_block = 1;
                pad_batch_size = ${block_size};
            }
            for (int j = 0; j < n_batches_block; j++) {
                int offset = j * pad_batch_size;
                // Up-sweep (reduce) phase.
                for (int i = 1; i <= pad_batch_size; i <<= 1) {
                    int index = ((threadIdx.x + 1) * 2 * i - 1);
                    int index_block = offset + index;
                    if (index < (pad_batch_size)) {
                        temp[index_block] ${op}= temp[index_block - i];
                    }
                    __syncthreads();
                }
                // Down-sweep phase distributing partial results.
                for (int i = pad_batch_size >> 1; i > 0; i >>= 1) {
                    int index = ((threadIdx.x + 1) * 2 * i - 1);
                    int index_block = offset + index;
                    if ((index + i) < (pad_batch_size)) {
                        temp[index_block + i] ${op}= temp[index_block];
                    }
                    __syncthreads();
                }
            }
            if (must_copy) {
                dst[idx0_idx] = temp[thid];
            }
        }
    }
    """).substitute(name=name, dtype=dtype, block_size=block_size,
                    op=op_char[op], identity=identity[op],
                    src_c_cont=src_c_cont, out_c_cont=out_c_cont)
    module = compile_with_cache(source)
    return module.get_function(name)
@_util.memoize(for_each_device=True)
def _add_scan_batch_blocked_sum_kernel(dtype, op, block_size, c_cont):
    """Return a kernel that folds the scanned per-block totals back into a
    partially scanned 2-D array (second pass of the batched scan).

    Each element combines in the last element of the preceding block of its
    own row, completing the row-wise inclusive scan that
    ``_inclusive_batch_scan_kernel`` computed per block.
    """
    name = 'cupy_add_scan_blocked_sum_kernel'
    dtype = get_typename(dtype)
    ops = {scan_op.SCAN_SUM: '+', scan_op.SCAN_PROD: '*'}
    source = string.Template("""
    extern "C" __global__ void ${name}(
            CArray<${dtype}, 2, ${c_cont}> src_dst, int batch_size) {
        long long n = src_dst.size();
        unsigned int thid = threadIdx.x;
        unsigned int block = blockIdx.x * ${block_size};

        unsigned int idx0 = thid + block;

        // Respect padding
        unsigned int row = idx0 / batch_size;
        unsigned int col = idx0 % batch_size;
        // Start column of the block this element belongs to.
        int my_block = ${block_size} * (col / ${block_size});
        const ptrdiff_t dst_idx[] = {row, col};
        const ptrdiff_t src_idx[] = {row, my_block - 1};
        // Avoid for the first block of every row
        // This can be tweaked with kernel launch settings
        bool first = col < ${block_size};
        // The last element of each block already holds the correct total.
        bool is_block = (col % (${block_size})) == ${block_size} - 1;
        if (idx0 < n && !first && !is_block) {
            src_dst[dst_idx] ${op}= src_dst[src_idx];
        }
    }
    """).substitute(name=name, dtype=dtype, op=ops[op], block_size=block_size,
                    c_cont=c_cont)
    module = compile_with_cache(source)
    return module.get_function(name)
cdef _ndarray_base _batch_scan_op(
        _ndarray_base a, scan_op op, _ndarray_base out):
    """Inclusive scan of a 2-D array along axis 1, writing into ``out``.

    Works in two passes: a per-block scan kernel, then (for rows longer than
    one block) a recursive scan of the block totals which is folded back in.
    """
    batch_size = a.shape[1]
    # TODO(ecastill) replace this with "_reduction._block_size" once it is
    # properly exposed
    block_size = 512
    # Since we need to pad each batch we spawn more threads as some
    # of them will be idle
    # Calc the total number of blocks
    padded_bs = 1 << ((batch_size - 1).bit_length())
    if padded_bs > block_size:
        blocks_per_batch = (batch_size - 1) // block_size + 1
        padded_bs = block_size * blocks_per_batch
    padded_size = a.size // batch_size * padded_bs
    cdef int src_cont = int(a.flags.c_contiguous)
    cdef int out_cont = int(out.flags.c_contiguous)
    kern_scan = _inclusive_batch_scan_kernel(a.dtype, block_size, op,
                                             src_cont, out_cont)
    kern_scan(grid=((padded_size - 1) // (block_size) + 1,),
              block=(block_size,),
              args=(a, out, batch_size),
              shared_mem=a.itemsize * block_size)

    if batch_size > block_size:
        # Scan the last element of every block (the block totals) in place,
        # recursively, then distribute them over the other elements.
        blocked_sum = out[:, block_size-1::block_size]
        _batch_scan_op(blocked_sum, op, blocked_sum)
        kern_add = _add_scan_batch_blocked_sum_kernel(
            out.dtype, op, block_size, out_cont)
        kern_add(
            grid=((out.size - 1) // (block_size) + 1,),
            block=(block_size,),
            args=(out, batch_size))
    return out
cdef _proc_as_batch(_ndarray_base x, int axis, scan_op op):
    """Scan ``x`` along ``axis`` by viewing it as a 2-D batch of rows."""
    ndim = x.ndim
    axis_len = x.shape[axis]
    # An empty scan axis yields an (empty) array of the same shape.
    if axis_len == 0:
        return cupy.empty_like(x)
    # Move the scan axis to the end so every row is one independent scan.
    moved = cupy.rollaxis(x, axis, ndim)
    moved_shape = moved.shape
    rows = moved.reshape(-1, axis_len)
    _batch_scan_op(rows, op, rows)  # scans each row in place
    # Undo the axis move to restore the caller's layout.
    return cupy.rollaxis(rows.reshape(moved_shape), ndim - 1, axis)
cpdef scan_core(
        _ndarray_base a, axis, scan_op op, dtype=None, _ndarray_base out=None):
    """Driver for cumsum/cumprod: resolves dtype, picks the flat or batched
    scan, optionally dispatches to the CUB accelerator, and honors ``out``.
    """
    if out is None:
        if dtype is None:
            # Mirror NumPy's promotion: bools and narrow ints accumulate in
            # the platform (unsigned) long.
            kind = a.dtype.kind
            if kind == 'b':
                dtype = numpy.dtype('l')
            elif kind == 'i' and a.dtype.itemsize < numpy.dtype('l').itemsize:
                dtype = numpy.dtype('l')
            elif kind == 'u' and a.dtype.itemsize < numpy.dtype('L').itemsize:
                dtype = numpy.dtype('L')
            else:
                dtype = a.dtype
        result = None
    else:
        if (out.flags.c_contiguous or out.flags.f_contiguous):
            # Scan directly into the caller's buffer.
            result = out
            elementwise_copy(a, result)
        else:
            # Non-contiguous out: scan a contiguous copy, write back below.
            result = a.astype(out.dtype, order='C')
    if axis is None:
        for accelerator in _accelerator._routine_accelerators:
            if accelerator == _accelerator.ACCELERATOR_CUB:
                if result is None:
                    result = a.astype(dtype, order='C').ravel()
                # result will be None if the scan is not compatible with CUB
                if op == scan_op.SCAN_SUM:
                    cub_op = cub.CUPY_CUB_CUMSUM
                else:
                    cub_op = cub.CUPY_CUB_CUMPROD
                res = cub.cub_scan(result, cub_op)
                if res is not None:
                    break
        else:
            # No accelerator handled it: use the in-house scan kernels.
            if result is None:
                result = scan(a.ravel(), op, dtype=dtype)
            else:
                scan(result, op, dtype=dtype, out=result)
    else:
        if result is None:
            result = a.astype(dtype, order='C')
        axis = internal._normalize_axis_index(axis, a.ndim)
        result = _proc_as_batch(result, axis, op)
    # This is for when the original out param was not contiguous
    if out is not None and out.data != result.data:
        elementwise_copy(result.reshape(out.shape), out)
    else:
        out = result
    return out
# Only for test
def _scan_for_test(a, out=None):
    """Expose the raw 1-D sum-scan kernel path for unit tests only."""
    return scan(a, scan_op.SCAN_SUM, dtype=None, out=out)
cpdef _ndarray_base _nansum(_ndarray_base a, axis, dtype, out, keepdims):
    """NaN-ignoring sum: dispatch to the reduction matching the input."""
    if cupy.iscomplexobj(a):
        # Complex inputs need per-component NaN masking.
        reducer = _nansum_complex_dtype
    elif dtype is None:
        reducer = _nansum_auto_dtype
    else:
        reducer = _nansum_keep_dtype
    return reducer(a, axis, dtype, out, keepdims)
cpdef _ndarray_base _nanprod(_ndarray_base a, axis, dtype, out, keepdims):
    """NaN-ignoring product: dispatch to the reduction matching the input."""
    if cupy.iscomplexobj(a):
        # Complex inputs need per-component NaN masking.
        reducer = _nanprod_complex_dtype
    elif dtype is None:
        reducer = _nanprod_auto_dtype
    else:
        reducer = _nanprod_keep_dtype
    return reducer(a, axis, dtype, out, keepdims)
# Reduction kernels backing sum/prod and their NaN-ignoring variants.
# "*_auto_dtype" promotes narrow integer/bool inputs to (unsigned) long,
# as NumPy does; "*_keep_dtype" accumulates in the input's own dtype.
# The nan* variants map NaN inputs to the operation's identity
# (0 for sum, 1 for prod) before reducing.

_sum_auto_dtype = create_reduction_func(
    'cupy_sum',
    ('?->l', 'b->l', 'B->L', 'h->l', 'H->L', 'i->l', 'I->L', 'l->l', 'L->L',
     'q->q', 'Q->Q',
     ('e->e', (None, None, None, 'float')),
     'f->f', 'd->d', 'F->F', 'D->D'),
    ('in0', 'a + b', 'out0 = type_out0_raw(a)', None), 0)

_sum_keep_dtype = create_reduction_func(
    'cupy_sum_with_dtype',
    ('?->?', 'b->b', 'B->B', 'h->h', 'H->H', 'i->i', 'I->I', 'l->l', 'L->L',
     'q->q', 'Q->Q',
     ('e->e', (None, None, None, 'float')),
     'f->f', 'd->d', 'F->F', 'D->D'),
    ('in0', 'a + b', 'out0 = type_out0_raw(a)', None), 0)

_nansum_auto_dtype = create_reduction_func(
    'cupy_nansum',
    ('?->l', 'b->l', 'B->L', 'h->l', 'H->L', 'i->l', 'I->L', 'l->l', 'L->L',
     'q->q', 'Q->Q',
     ('e->e', (None, None, None, 'float')),
     'f->f', 'd->d', 'F->F', 'D->D'),
    # (in0 == in0) is false only for NaN; replace NaN with 0.
    ('(in0 == in0) ? in0 : type_in0_raw(0)',
     'a + b', 'out0 = type_out0_raw(a)', None), 0)

_nansum_keep_dtype = create_reduction_func(
    'cupy_nansum_with_dtype',
    ('?->?', 'b->b', 'B->B', 'h->h', 'H->H', 'i->i', 'I->I', 'l->l', 'L->L',
     'q->q', 'Q->Q',
     ('e->e', (None, None, None, 'float')),
     'f->f', 'd->d', 'F->F', 'D->D'),
    ('(in0 == in0) ? in0 : type_in0_raw(0)',
     'a + b', 'out0 = type_out0_raw(a)', None), 0)

_nansum_complex_dtype = create_reduction_func(
    'cupy_nansum_complex_dtype',
    ('F->F', 'D->D'),
    # Mask NaN in the real and imaginary components independently.
    ('''
    type_in0_raw((in0.real() == in0.real()) ? in0.real() : 0,
                 (in0.imag() == in0.imag()) ? in0.imag() : 0)
    ''',
     'a + b', 'out0 = type_out0_raw(a)', None), 0)

_prod_auto_dtype = create_reduction_func(
    'cupy_prod',
    ('?->l', 'b->l', 'B->L', 'h->l', 'H->L', 'i->l', 'I->L', 'l->l', 'L->L',
     'q->q', 'Q->Q',
     ('e->e', (None, None, None, 'float')),
     'f->f', 'd->d', 'F->F', 'D->D'),
    ('in0', 'a * b', 'out0 = type_out0_raw(a)', None), 1)

_prod_keep_dtype = create_reduction_func(
    'cupy_prod_with_dtype',
    ('?->?', 'b->b', 'B->B', 'h->h', 'H->H', 'i->i', 'I->I', 'l->l', 'L->L',
     'q->q', 'Q->Q',
     ('e->e', (None, None, None, 'float')),
     'f->f', 'd->d', 'F->F', 'D->D'),
    ('in0', 'a * b', 'out0 = type_out0_raw(a)', None), 1)

_nanprod_auto_dtype = create_reduction_func(
    'cupy_nanprod',
    ('?->l', 'b->l', 'B->L', 'h->l', 'H->L', 'i->l', 'I->L', 'l->l', 'L->L',
     'q->q', 'Q->Q',
     ('e->e', (None, None, None, 'float')),
     'f->f', 'd->d', 'F->F', 'D->D'),
    ('(in0 == in0) ? in0 : type_in0_raw(1)',
     'a * b', 'out0 = type_out0_raw(a)', None), 1)

_nanprod_keep_dtype = create_reduction_func(
    'cupy_nanprod_with_dtype',
    ('?->?', 'b->b', 'B->B', 'h->h', 'H->H', 'i->i', 'I->I', 'l->l', 'L->L',
     'q->q', 'Q->Q',
     ('e->e', (None, None, None, 'float')),
     'f->f', 'd->d', 'F->F', 'D->D'),
    ('(in0 == in0) ? in0 : type_in0_raw(1)',
     'a * b', 'out0 = type_out0_raw(a)', None), 1)

_nanprod_complex_dtype = create_reduction_func(
    'cupy_nanprod_complex_dtype',
    ('F->F', 'D->D'),
    ('''
    type_in0_raw((in0.real() == in0.real()) ? in0.real() : 1,
                 (in0.imag() == in0.imag()) ? in0.imag() : 1)
    ''',
     'a * b', 'out0 = type_out0_raw(a)', None), 1)
cdef create_arithmetic(
        name, op, boolop, doc, cutensor_op=None, scatter_op=None):
    """Build a binary arithmetic ufunc named ``cupy_<name>``.

    ``boolop`` is either a str (the operator for bool-bool inputs) or a
    callable (a function to raise an error for bool-bool inputs).
    """
    bool_routine = boolop
    if isinstance(bool_routine, str):
        bool_routine = 'out0 = in0 %s in1' % bool_routine
    type_table = (
        ('??->?', bool_routine),
        'bb->b', 'BB->B', 'hh->h', 'HH->H', 'ii->i', 'II->I', 'll->l',
        'LL->L', 'qq->q', 'QQ->Q', 'ee->e', 'ff->f', 'dd->d', 'FF->F',
        'DD->D')
    return create_ufunc(
        'cupy_' + name,
        type_table,
        'out0 = in0 %s in1' % op,
        doc=doc,
        cutensor_op=cutensor_op,
        scatter_op=scatter_op)
# Elementwise arithmetic / complex-helper ufuncs exposed below.

_add = create_arithmetic(
    'add', '+', '|',
    '''Adds two arrays elementwise.

    .. seealso:: :data:`numpy.add`

    ''',
    cutensor_op=('OP_ADD', 1, 1), scatter_op='add')

_conjugate = create_ufunc(
    'cupy_conjugate',
    ('b->b', 'B->B', 'h->h', 'H->H', 'i->i', 'I->I', 'l->l', 'L->L', 'q->q',
     'Q->Q', 'e->e', 'f->f', 'd->d',
     ('F->F', 'out0 = conj(in0)'),
     ('D->D', 'out0 = conj(in0)')),
    # Real dtypes are their own conjugate.
    'out0 = in0',
    doc='''Returns the complex conjugate, element-wise.

    .. seealso:: :data:`numpy.conjugate`

    ''')

_angle = create_ufunc(
    'cupy_angle',
    ('?->d', 'e->e', 'f->f', 'd->d',
     ('F->f', 'out0 = arg(in0)'),
     ('D->d', 'out0 = arg(in0)')),
    # For real inputs the angle is 0 for non-negative, pi for negative.
    'out0 = in0 >= 0 ? 0 : M_PI',
    doc='''Returns the angle of the complex argument.

    .. seealso:: :func:`numpy.angle`

    ''')

_angle_deg = create_ufunc(
    'cupy_angle_deg',
    ('?->d', 'e->e', 'f->f', 'd->d',
     ('F->f', 'out0 = arg(in0) * (180.0 / M_PI)'),
     ('D->d', 'out0 = arg(in0) * (180.0 / M_PI)')),
    'out0 = in0 >= 0 ? 0 : 180.0',
    doc='''Returns the angle of the complex argument.

    .. seealso:: :func:`numpy.angle`

    ''')
def _positive_boolean_error():
raise TypeError(
'The cupy boolean positive, the `+` operator, is not supported.')
# Unary ``+``; bool inputs raise via _positive_boolean_error (NumPy parity).
_positive = create_ufunc(
    'cupy_positive',
    (('?->?', _positive_boolean_error),
     'b->b', 'B->B', 'h->h', 'H->H', 'i->i', 'I->I', 'l->l', 'L->L',
     'q->q', 'Q->Q', 'e->e', 'f->f', 'd->d', 'F->F', 'D->D'),
    'out0 = +in0',
    doc='''Takes numerical positive elementwise.

    .. seealso:: :data:`numpy.positive`

    ''')
def _negative_boolean_error():
raise TypeError(
'The cupy boolean negative, the `-` operator, is not supported, '
'use the `~` operator or the logical_not function instead.')
# Unary ``-``; bool inputs raise via _negative_boolean_error (NumPy parity).
_negative = create_ufunc(
    'cupy_negative',
    (('?->?', _negative_boolean_error),
     'b->b', 'B->B', 'h->h', 'H->H', 'i->i', 'I->I', 'l->l', 'L->L',
     'q->q', 'Q->Q', 'e->e', 'f->f', 'd->d', 'F->F', 'D->D'),
    'out0 = -in0',
    doc='''Takes numerical negative elementwise.

    .. seealso:: :data:`numpy.negative`

    ''')

# Bool * bool is logical AND (&), matching NumPy.
_multiply = create_arithmetic(
    'multiply', '*', '&',
    '''Multiplies two arrays elementwise.

    .. seealso:: :data:`numpy.multiply`

    ''',
    cutensor_op=('OP_MUL', 1, 1))
# `integral_power` should return somewhat appropriate values for negative
# integral powers (for which NumPy would raise errors). Hence the branches in
# the beginning. This behavior is not officially documented and could change.
cdef _power_preamble = '''
template <typename T>
inline __device__ T integral_power(T in0, T in1) {
    if (in1 < 0) {
        // Negative exponents of integers: only |base| == 1 is non-zero.
        if (in0 == -1) {return (in1 & 1) ? -1 : 1;}
        else {return (in0 == 1) ? 1 : 0;}
    }
    // Exponentiation by squaring.
    T out0 = 1;
    while (in1 > 0) {
        if (in1 & 1) {
            out0 *= in0;
        }
        in0 *= in0;
        in1 >>= 1;
    }
    return out0;
}

template <typename T>
inline __device__ T complex_power(T in0, T in1) {
    // pow(0, 0) is defined as 1 here (matches NumPy's convention).
    return in1 == T(0) ? T(1): pow(in0, in1);
}
'''

_power = create_ufunc(
    'cupy_power',
    ('??->b', 'bb->b', 'BB->B', 'hh->h', 'HH->H', 'ii->i', 'II->I', 'll->l',
     'LL->L', 'qq->q', 'QQ->Q',
     ('ee->e', 'out0 = powf(in0, in1)'),
     ('ff->f', 'out0 = powf(in0, in1)'),
     ('dd->d', 'out0 = pow(in0, in1)'),
     ('FF->F', 'out0 = complex_power(in0, in1)'),
     ('DD->D', 'out0 = complex_power(in0, in1)')),
    'out0 = integral_power(in0, in1)',
    preamble=_power_preamble,
    doc='''Computes ``x1 ** x2`` elementwise.

    .. seealso:: :data:`numpy.power`

    ''')
def _subtract_boolean_error():
raise TypeError(
'cupy boolean subtract, the `-` operator, is deprecated, use the '
'bitwise_xor, the `^` operator, or the logical_xor function instead.')
# Bool - bool raises (deprecated in NumPy); see _subtract_boolean_error.
_subtract = create_arithmetic(
    'subtract', '-', _subtract_boolean_error,
    '''Subtracts arguments elementwise.

    .. seealso:: :data:`numpy.subtract`

    ''',
    cutensor_op=('OP_ADD', 1, -1), scatter_op='sub')

# Integer inputs promote to double, as in NumPy's true_divide.
_true_divide = create_ufunc(
    'cupy_true_divide',
    ('bb->d', 'BB->d', 'hh->d', 'HH->d', 'ii->d', 'II->d', 'll->d', 'LL->d',
     'qq->d', 'QQ->d', 'ee->e', 'ff->f', 'dd->d', 'FF->F', 'DD->D'),
    'out0 = (out0_type)in0 / (out0_type)in1',
    doc='''Elementwise true division (i.e. division as floating values).

    .. seealso:: :data:`numpy.true_divide`

    ''',
    out_ops=('ee->e', 'ff->f', 'dd->d', 'FF->F', 'DD->D'),
)

# In Python 3, `/` is true division.
_divide = _true_divide

_floor_divide = create_ufunc(
    'cupy_floor_divide',
    ('bb->b', 'BB->B', 'hh->h', 'HH->H', 'ii->i', 'II->I', 'll->l', 'LL->L',
     'qq->q', 'QQ->Q', 'ee->e', 'ff->f', 'dd->d'),
    'out0 = _floor_divide(in0, in1)',
    doc='''Elementwise floor division (i.e. integer quotient).

    .. seealso:: :data:`numpy.floor_divide`

    ''')

_remainder = create_ufunc(
    'cupy_remainder',
    ('bb->b', 'BB->B', 'hh->h', 'HH->H', 'ii->i', 'II->I', 'll->l', 'LL->L',
     'qq->q', 'QQ->Q',
     ('ee->e', 'out0 = in0 - _floor_divide(in0, in1) * in1'),
     ('ff->f', 'out0 = in0 - _floor_divide(in0, in1) * in1'),
     ('dd->d', 'out0 = in0 - _floor_divide(in0, in1) * in1')),
    # Integer case: the trailing `* (in1 != 0)` forces the result of a
    # division by zero to 0, matching NumPy's `x % 0 == 0` behavior.
    'out0 = (in0 - _floor_divide(in0, in1) * in1) * (in1 != 0)',
    doc='''Computes the remainder of Python division elementwise.

    .. seealso:: :data:`numpy.remainder`

    ''')

_absolute = create_ufunc(
    'cupy_absolute',
    # Unsigned/bool dtypes are already non-negative: plain copy.
    (('?->?', 'out0 = in0'),
     'b->b', ('B->B', 'out0 = in0'), 'h->h', ('H->H', 'out0 = in0'),
     'i->i', ('I->I', 'out0 = in0'), 'l->l', ('L->L', 'out0 = in0'),
     'q->q', ('Q->Q', 'out0 = in0'),
     ('e->e', 'out0 = fabsf(in0)'),
     ('f->f', 'out0 = fabsf(in0)'),
     ('d->d', 'out0 = fabs(in0)'),
     ('F->f', 'out0 = abs(in0)'),
     ('D->d', 'out0 = abs(in0)')),
    'out0 = in0 > 0 ? in0 : -in0',
    doc='''Elementwise absolute value function.

    .. seealso:: :data:`numpy.absolute`

    ''')

_sqrt = create_ufunc(
    'cupy_sqrt',
    ('e->e', 'f->f', 'd->d', 'F->F', 'D->D'),
    'out0 = sqrt(in0)',
    doc='''Elementwise square root function.

    .. seealso:: :data:`numpy.sqrt`

    ''')

# clip(x, lo, hi); when lo > hi the result is hi, matching NumPy.
_clip = create_ufunc(
    'cupy_clip',
    ('???->?', 'bbb->b', 'BBB->B', 'hhh->h', 'HHH->H', 'iii->i', 'III->I',
     'lll->l', 'LLL->L', 'qqq->q', 'QQQ->Q', 'eee->e', 'fff->f', 'ddd->d'),
    'out0 = in1 > in2 ? in2 : (in0 < in1 ? in1 : (in0 > in2 ? in2 : in0))')
# Variables to expose to Python
# (cythonized data cannot be exposed to Python, even with cpdef.)
add = _add
conjugate = _conjugate
angle = _angle
angle_deg = _angle_deg
positive = _positive
negative = _negative
multiply = _multiply
divide = _divide
power = _power
subtract = _subtract
true_divide = _true_divide
floor_divide = _floor_divide
remainder = _remainder
absolute = _absolute
sqrt = _sqrt

sum_auto_dtype = _sum_auto_dtype  # used from cupy/math/sumprod.py
nansum_auto_dtype = _nansum_auto_dtype  # used from cupy/math/sumprod.py
prod_auto_dtype = _prod_auto_dtype  # used from cupy/math/sumprod.py
nanprod_auto_dtype = _nanprod_auto_dtype  # used from cupy/math/sumprod.py
clip = _clip  # used from cupy/math/misc.py
from cupy._core.core cimport _ndarray_base
cdef _ndarray_sort(_ndarray_base self, int axis)
cdef _ndarray_base _ndarray_argsort(_ndarray_base self, axis)
cdef _ndarray_partition(_ndarray_base self, kth, int axis)
cdef _ndarray_base _ndarray_argpartition(self, kth, axis)
import string
import numpy
import cupy
from cupy._core._scalar import get_typename as _get_typename
from cupy._core._ufuncs import elementwise_copy
import cupy._core.core as core
from cupy import _util
from cupy.cuda import thrust
from cupy._core cimport _routines_manipulation as _manipulation
from cupy._core.core cimport compile_with_cache
from cupy._core.core cimport _ndarray_base
from cupy._core cimport internal
cdef _ndarray_sort(_ndarray_base self, int axis):
    """In-place sort of ``self`` along ``axis`` using Thrust."""
    cdef int ndim = self._shape.size()
    cdef _ndarray_base data

    if not cupy.cuda.thrust.available:
        raise RuntimeError('Thrust is needed to use cupy.sort. Please '
                           'install CUDA Toolkit with Thrust then '
                           'reinstall CuPy after uninstalling it.')

    if ndim == 0:
        raise numpy.AxisError('Sorting arrays with the rank of zero is not '
                              'supported')  # as numpy.sort() raises

    # TODO(takagi): Support sorting views
    if not self._c_contiguous:
        raise NotImplementedError('Sorting non-contiguous array is not '
                                  'supported.')

    axis = internal._normalize_axis_index(axis, ndim)

    # Thrust sorts along the last axis; bring the target axis there
    # (a copy is needed because we operate on raw pointers below).
    if axis == ndim - 1:
        data = self
    else:
        data = _manipulation.rollaxis(self, axis, ndim).copy()

    if ndim == 1:
        thrust.sort(self.dtype, data.data.ptr, 0, self.shape)
    else:
        # Batched sort: process at most max_size rows at a time so the
        # auxiliary key buffer stays bounded (~1 << 22 elements).
        max_size = max(min(1 << 22, data.size) // data.shape[-1], 1)
        keys_array = core.ndarray(
            (max_size * data.shape[-1],), dtype=numpy.intp)
        stop = data.size // data.shape[-1]
        for offset in range(0, stop, max_size):
            width = min(max_size, stop - offset)
            thrust.sort(
                self.dtype,
                data.data.ptr + offset * data.shape[-1] * data.itemsize,
                keys_array.data.ptr,
                (width, data.shape[-1]),
            )

    if axis == ndim - 1:
        # ``data`` is ``self``: already sorted in place.
        pass
    else:
        data = _manipulation.rollaxis(data, -1, axis)
        elementwise_copy(data, self)
cdef _ndarray_base _ndarray_argsort(_ndarray_base self, axis):
    """Return the indices that would sort ``self`` along ``axis``
    (flattened when ``axis`` is None), using Thrust.
    """
    cdef int _axis, ndim
    cdef _ndarray_base data

    if not cupy.cuda.thrust.available:
        raise RuntimeError('Thrust is needed to use cupy.argsort. Please '
                           'install CUDA Toolkit with Thrust then '
                           'reinstall CuPy after uninstalling it.')

    self = cupy.atleast_1d(self)
    ndim = self._shape.size()

    if axis is None:
        data = self.ravel()
        _axis = -1
    else:
        data = self
        _axis = axis

    _axis = internal._normalize_axis_index(_axis, ndim)

    # Thrust argsorts along the last axis; copy because the sort mutates
    # ``data`` while producing the index array.
    if _axis == ndim - 1:
        data = data.copy()
    else:
        data = _manipulation.rollaxis(data, _axis, ndim).copy()

    shape = data.shape
    idx_array = core.ndarray(shape, dtype=numpy.intp)

    if ndim == 1:
        thrust.argsort(self.dtype, idx_array.data.ptr, data.data.ptr, 0,
                       shape)
    else:
        keys_array = core.ndarray(shape, dtype=numpy.intp)
        thrust.argsort(self.dtype, idx_array.data.ptr, data.data.ptr,
                       keys_array.data.ptr, shape)

    if _axis == ndim - 1:
        return idx_array
    else:
        return _manipulation.rollaxis(idx_array, -1, _axis)
cdef _ndarray_partition(_ndarray_base self, kth, int axis):
    """Partitions an array.

    Args:
        kth (int or sequence of ints): Element index to partition by. If
            supplied with a sequence of k-th it will partition all elements
            indexed by k-th of them into their sorted position at once.

        axis (int): Axis along which to sort. Default is -1, which means
            sort along the last axis.

    .. seealso::
        :func:`cupy.partition` for full documentation,
        :meth:`numpy.ndarray.partition`

    """
    cdef int ndim = self._shape.size()
    cdef Py_ssize_t k, max_k, length, s, sz, t
    cdef _ndarray_base data

    if ndim == 0:
        raise numpy.AxisError('Sorting arrays with the rank of zero is not '
                              'supported')

    if not self._c_contiguous:
        raise NotImplementedError('Sorting non-contiguous array is not '
                                  'supported.')

    axis = internal._normalize_axis_index(axis, ndim)

    # Work with the partition axis as the last axis.
    if axis == ndim - 1:
        data = self
    else:
        data = _manipulation.rollaxis(self, axis, ndim).copy()

    length = self._shape[axis]
    if isinstance(kth, int):
        kth = kth,
    max_k = 0
    for k in kth:
        if k < 0:
            k += length
        if not (0 <= k < length):
            raise ValueError('kth(={}) out of bounds {}'.format(k, length))
        if max_k < k:
            max_k = k

    # For simplicity, max_k is round up to the power of 2. If max_k is
    # already the power of 2, it is round up to the next power of 2 because
    # we need to collect the first max(kth)+1 elements.
    max_k = max(32, 1 << max_k.bit_length())

    # The parameter t is the length of the list that stores elements to be
    # selected for each thread. We divide the array into sz subarrays.
    # These parameters are determined from the measurement on TITAN X.
    t = 4
    sz = 512
    while sz > 0 and length // sz < max_k + 32 * t:
        sz //= 2
    sz *= self.size // length

    # If the array size is small or k is large, we simply sort the array.
    if length < 32 or sz <= 32 or max_k >= 1024:
        # kth is ignored.
        data.sort(axis=-1)
    else:
        shape = data.shape
        data = data.ravel()

        # For each subarray, we collect first k elements to the head.
        kern, merge_kern = _partition_kernel(self.dtype)
        block_size = 32
        grid_size = sz
        kern(grid=(grid_size,), block=(block_size,), args=(
            data, max_k, self.size, t, sz))

        # Merge heads of subarrays.
        s = 1
        while s < sz // (self.size // length):
            block_size = 32
            grid_size = sz // s // 2
            merge_kern(grid=(grid_size,), block=(block_size,), args=(
                data, max_k, self.size, sz, s))
            s *= 2

        data = data.reshape(shape)

    if axis != ndim - 1:
        data = _manipulation.rollaxis(data, -1, axis)
        elementwise_copy(data, self)
cdef _ndarray_base _ndarray_argpartition(self, kth, axis):
    """Returns the indices that would partially sort an array.

    Args:
        kth (int or sequence of ints): Element index to partition by. If
            supplied with a sequence of k-th it will partition all elements
            indexed by k-th of them into their sorted position at once.

        axis (int or None): Axis along which to sort. Default is -1, which
            means sort along the last axis. If None is supplied, the array
            is flattened before sorting.

    Returns:
        cupy.ndarray: Array of the same type and shape as ``a``.

    .. seealso::
        :func:`cupy.argpartition` for full documentation,
        :meth:`numpy.ndarray.argpartition`

    """
    cdef int _axis, ndim
    cdef Py_ssize_t k, max_k, length, s, sz, t
    cdef _ndarray_base data

    if axis is None:
        data = self.ravel()
        _axis = -1
    else:
        data = self
        _axis = axis

    ndim = data._shape.size()
    _axis = internal._normalize_axis_index(_axis, ndim)

    # Work with the partition axis as the last axis.
    if _axis != ndim - 1:
        data = _manipulation.rollaxis(self, _axis, ndim).copy()

    length = data._shape[ndim - 1]
    if length == 0:
        return cupy.empty((0,), dtype=cupy.int64)

    if isinstance(kth, int):
        kth = kth,
    max_k = 0
    for k in kth:
        if k < 0:
            k += length
        if not (0 <= k < length):
            raise ValueError('kth(={}) out of bounds {}'.format(k, length))
        if max_k < k:
            max_k = k

    # For simplicity, max_k is round up to the power of 2. If max_k is
    # already the power of 2, it is round up to the next power of 2 because
    # we need to collect the first max(kth)+1 elements.
    max_k = max(32, 1 << max_k.bit_length())

    # The parameter t is the length of the list that stores elements to be
    # selected for each thread. We divide the array into sz subarrays.
    # These parameters are determined from the measurement on TITAN X.
    t = 4
    sz = 512
    while sz > 0 and length // sz < max_k + 32 * t:
        sz //= 2
    sz *= self.size // length

    shape = data.shape
    # If the array size is small or k is large, we simply sort the array.
    if length < 32 or sz < 1 or max_k >= 1024:
        # kth is ignored.
        indices = data.argsort(axis=-1)
    else:
        data = data.ravel()
        indices = cupy.arange(0, data.shape[0], dtype=cupy.int64)

        # For each subarray, we collect first k elements to the head.
        kern, merge_kern = _argpartition_kernel(self.dtype)
        block_size = 32
        grid_size = sz
        kern(grid=(grid_size,), block=(block_size,), args=(
            data, indices, max_k, self.size, t, sz))

        # Merge heads of subarrays.
        s = 1
        while s < sz // (self.size // length):
            block_size = 32
            grid_size = sz // s // 2
            merge_kern(grid=(grid_size,), block=(block_size,), args=(
                data, indices, max_k, self.size, sz, s))
            s *= 2

        # Rearrange indices w.r.t the original axis
        axis_indices = cupy.unravel_index(indices, shape)
        indices = axis_indices[-1]

    indices = indices.reshape(shape)
    if _axis != ndim - 1:
        indices = _manipulation.rollaxis(indices, -1, _axis)

    return indices
@_util.memoize(for_each_device=True)
def _partition_kernel(dtype):
    """Compile (with caching) the CUDA kernels used by in-place partition.

    Returns a pair ``(kern, merge_kern)``: the per-subarray selection
    kernel and the kernel that merges heads of adjacent subarrays.
    Memoized per device and per ``dtype``.
    """
    name = 'partition_kernel'
    merge_kernel = 'partition_merge_kernel'
    # Translate the numpy dtype into its CUDA C++ type name; it is
    # substituted into the ${dtype} placeholders of the template below.
    dtype = _get_typename(dtype)
    source = string.Template('''
template<typename T>
__device__ void bitonic_sort_step(CArray<T, 1, true> a,
        ptrdiff_t x, ptrdiff_t y, int i, ptrdiff_t s, ptrdiff_t w) {
    for (ptrdiff_t j = i; j < (y - x) / 2; j += 32) {
        ptrdiff_t n = j + (j & -w);
        T v = a[n + x], u = a[n + w + x];
        if (n & s ? v < u : v > u) {
            a[n + x] = u;
            a[n + w + x] = v;
        }
    }
}

// Sort a[x:y].
template<typename T>
__device__ void bitonic_sort(
        CArray<T, 1, true> a, ptrdiff_t x, ptrdiff_t y, int i) {
    for (ptrdiff_t s = 2; s <= y - x; s *= 2) {
        for (ptrdiff_t w = s / 2; w >= 1; w /= 2) {
            bitonic_sort_step< T >(a, x, y, i, s, w);
        }
    }
}

// Merge first k elements and the next 32 times t elements.
template<typename T>
__device__ void merge(
        CArray<T, 1, true> a,
        int k, int i, ptrdiff_t x, ptrdiff_t z, int u) {
    for (int s = i; s < u; s += 32) {
        if (a[x + k - s - 1] > a[z + s]) {
            T tmp = a[x + k - s - 1];
            a[x + k - s - 1] = a[z + s];
            a[z + s] = tmp;
        }
    }

    // After merge step, the first k elements are already bitonic.
    // Therefore, we do not need to fully sort.
    for (int w = k / 2; w >= 1; w /= 2) {
        bitonic_sort_step< T >(a, x, k + x, i, k, w);
    }
}

extern "C" {
// In this function, 32 threads handle one subarray. This number equals to
// the warp size. The first k elements are always sorted and the next 32
// times t elements stored values that have possibilities to be selected.
__global__ void ${name}(
        CArray<${dtype}, 1, true> a,
        int k, ptrdiff_t n, int t, ptrdiff_t sz) {
    // This thread handles a[z:m].
    ptrdiff_t i = static_cast<ptrdiff_t>(blockIdx.x) * blockDim.x
        + threadIdx.x;
    ptrdiff_t z = i / 32 * n / sz;
    ptrdiff_t m = (i / 32 + 1) * n / sz;
    int id = i % 32;
    int x = 0;
    bitonic_sort< ${dtype} >(a, z, k + z, id);
    ptrdiff_t j;
    for (j = k + id + z; j < m - (m - z) % 32; j += 32) {
        if (a[j] < a[k - 1 + z]) {
            ${dtype} tmp = a[k + 32 * x + id + z];
            a[k + 32 * x + id + z] = a[j];
            a[j] = tmp;
            ++x;
        }

        // If at least one thread in the warp has found t values that
        // can be selected, we update the first k elements.
#if __CUDACC_VER_MAJOR__ >= 9
        if (__any_sync(0xffffffff, x >= t)) {
#else
        if (__any(x >= t)) {
#endif
            bitonic_sort< ${dtype} >(a, k + z, 32 * t + k + z, id);
            merge< ${dtype} >(a, k, id, z, k + z, min(k, 32 * t));
            x = 0;
        }
    }
    if (j < m && a[j] < a[k - 1 + z]) {
        ${dtype} tmp = a[k + 32 * x + id + z];
        a[k + 32 * x + id + z] = a[j];
        a[j] = tmp;
    }

    // Finally, we merge the first k elements and the remainders to be
    // stored.
    bitonic_sort< ${dtype} >(a, k + z, 32 * t + k + z, id);
    merge< ${dtype} >(a, k, id, z, k + z, min(k, 32 * t));
}

__global__ void ${merge_kernel}(
        CArray<${dtype}, 1, true> a, int k, ptrdiff_t n, int sz, int s) {
    ptrdiff_t i = static_cast<ptrdiff_t>(blockIdx.x) * blockDim.x
        + threadIdx.x;
    ptrdiff_t z = i / 32 * 2 * s * n / sz;
    ptrdiff_t m = (i / 32 * 2 + 1) * s * n / sz;
    int id = i % 32;
    merge< ${dtype} >(a, k, id, z, m, k);
}
}
''').substitute(name=name, merge_kernel=merge_kernel, dtype=dtype)
    # Compile once per (device, dtype); subsequent calls hit the memo cache.
    module = compile_with_cache(source)
    return module.get_function(name), module.get_function(merge_kernel)
@_util.memoize(for_each_device=True)
def _argpartition_kernel(dtype):
    """Compile (with caching) the CUDA kernels used by ``argpartition``.

    Same algorithm as ``_partition_kernel`` but operates indirectly: the
    kernels permute an index array ``b`` (long long) while comparing the
    values ``a[b[...]]``, leaving ``a`` itself untouched.
    Returns ``(kern, merge_kern)``; memoized per device and per ``dtype``.
    """
    name = 'argpartition_kernel'
    merge_kernel = 'argpartition_merge_kernel'
    # CUDA C++ type name for the ${dtype} template placeholder.
    dtype = _get_typename(dtype)
    source = string.Template('''
template<typename T>
__device__ void bitonic_sort_step(
        CArray<T, 1, true> a, CArray<long long, 1, true> b,
        ptrdiff_t x, ptrdiff_t y, int i, ptrdiff_t s, ptrdiff_t w) {
    for (ptrdiff_t j = i; j < (y - x) / 2; j += 32) {
        ptrdiff_t n = j + (j & -w);
        T v = a[b[n + x]], u = a[b[n + w + x]];
        if (n & s ? v < u : v > u) {
            long long temp = b[n + x];
            b[n + x] = b[n + w + x];
            b[n + w + x] = temp;
        }
    }
}

// Sort a[x:y].
template<typename T>
__device__ void bitonic_sort(
        CArray<T, 1, true> a, CArray<long long, 1, true> b,
        ptrdiff_t x, ptrdiff_t y, int i) {
    for (ptrdiff_t s = 2; s <= y - x; s *= 2) {
        for (ptrdiff_t w = s / 2; w >= 1; w /= 2) {
            bitonic_sort_step< T >(a, b, x, y, i, s, w);
        }
    }
}

// Merge first k elements and the next 32 times t elements.
template<typename T>
__device__ void merge(
        CArray<T, 1, true> a, CArray<long long, 1, true> b,
        int k, int i, ptrdiff_t x, ptrdiff_t z, int u) {
    for (int s = i; s < u; s += 32) {
        if (a[b[x + k - s - 1]] > a[b[z + s]]) {
            long long tmp = b[x + k - s - 1];
            b[x + k - s - 1] = b[z + s];
            b[z + s] = tmp;
        }
    }

    // After merge step, the first k elements are already bitonic.
    // Therefore, we do not need to fully sort.
    for (int w = k / 2; w >= 1; w /= 2) {
        bitonic_sort_step< T >(a, b, x, k + x, i, k, w);
    }
}

extern "C" {
// In this function, 32 threads handle one subarray. This number equals to
// the warp size. The first k elements are always sorted and the next 32
// times t elements stored values that have possibilities to be selected.
__global__ void ${name}(
        CArray<${dtype}, 1, true> a, CArray<long long, 1, true> b,
        int k, ptrdiff_t n, int t, ptrdiff_t sz) {
    // This thread handles a[z:m].
    ptrdiff_t i = static_cast<ptrdiff_t>(blockIdx.x) * blockDim.x
        + threadIdx.x;
    ptrdiff_t z = i / 32 * n / sz;
    ptrdiff_t m = (i / 32 + 1) * n / sz;
    int id = i % 32;
    int x = 0;
    bitonic_sort< ${dtype} >(a, b, z, k + z, id);
    ptrdiff_t j;
    for (j = k + id + z; j < m - (m - z) % 32; j += 32) {
        if (a[b[j]] < a[b[k - 1 + z]]) {
            long long tmp = b[k + 32 * x + id + z];
            b[k + 32 * x + id + z] = b[j];
            b[j] = tmp;
            ++x;
        }

        // If at least one thread in the warp has found t values that
        // can be selected, we update the first k elements.
#if __CUDACC_VER_MAJOR__ >= 9
        if (__any_sync(0xffffffff, x >= t)) {
#else
        if (__any(x >= t)) {
#endif
            bitonic_sort< ${dtype} >(a, b, k + z, 32 * t + k + z, id);
            merge< ${dtype} >(a, b, k, id, z, k + z, min(k, 32 * t));
            x = 0;
        }
    }
    if (j < m && a[b[j]] < a[b[k - 1 + z]]) {
        long long tmp = b[k + 32 * x + id + z];
        b[k + 32 * x + id + z] = b[j];
        b[j] = tmp;
    }

    // Finally, we merge the first k elements and the remainders to be
    // stored.
    bitonic_sort< ${dtype} >(a, b, k + z, 32 * t + k + z, id);
    merge< ${dtype} >(a, b, k, id, z, k + z, min(k, 32 * t));
}

__global__ void ${merge_kernel}(
        CArray<${dtype}, 1, true> a, CArray<long long, 1, true> b,
        int k, ptrdiff_t n, int sz, int s) {
    ptrdiff_t i = static_cast<ptrdiff_t>(blockIdx.x) * blockDim.x
        + threadIdx.x;
    ptrdiff_t z = i / 32 * 2 * s * n / sz;
    ptrdiff_t m = (i / 32 * 2 + 1) * s * n / sz;
    int id = i % 32;
    merge< ${dtype} >(a, b, k, id, z, m, k);
}
}
''').substitute(name=name, merge_kernel=merge_kernel, dtype=dtype)
    module = compile_with_cache(source)
    return module.get_function(name), module.get_function(merge_kernel)
from cupy._core.core cimport _ndarray_base
# TODO(niboshi): Move {nan,}arg{min,max} to sorting
# Declarations (.pxd-style) for the statistics routines implemented in the
# corresponding .pyx file.  All follow NumPy-style reduction signatures
# (axis / out / dtype / keepdims; var & std additionally take ddof).
cdef _ndarray_base _ndarray_max(_ndarray_base self, axis, out, dtype, keepdims)
cdef _ndarray_base _ndarray_min(_ndarray_base self, axis, out, dtype, keepdims)
cdef _ndarray_base _ndarray_ptp(_ndarray_base self, axis, out, keepdims)
cdef _ndarray_base _ndarray_argmax(
    _ndarray_base self, axis, out, dtype, keepdims)
cdef _ndarray_base _ndarray_argmin(
    _ndarray_base self, axis, out, dtype, keepdims)
cdef _ndarray_base _ndarray_mean(
    _ndarray_base self, axis, dtype, out, keepdims)
cdef _ndarray_base _ndarray_var(
    _ndarray_base self, axis, dtype, out, ddof, keepdims)
cdef _ndarray_base _ndarray_std(
    _ndarray_base self, axis, dtype, out, ddof, keepdims)
# NaN-aware variants and median helpers (cpdef: callable from Python too).
cpdef _ndarray_base _median(
    _ndarray_base a, axis, out, overwrite_input, keepdims)
cpdef _ndarray_base _nanmean(_ndarray_base a, axis, dtype, out, keepdims)
cpdef _ndarray_base _nanvar(_ndarray_base a, axis, dtype, out, ddof, keepdims)
cpdef _ndarray_base _nanstd(_ndarray_base a, axis, dtype, out, ddof, keepdims)
cpdef _ndarray_base _nanargmin(_ndarray_base a, axis, out, dtype, keepdims)
cpdef _ndarray_base _nanargmax(_ndarray_base a, axis, out, dtype, keepdims)
from cpython cimport sequence
import numpy
from numpy import nan
import cupy
from cupy._core import _reduction
from cupy._core._reduction import create_reduction_func
from cupy._core._reduction import ReductionKernel
from cupy._core._kernel import ElementwiseKernel
from cupy._core._ufuncs import elementwise_copy
from cupy._core cimport _accelerator
from cupy._core cimport _routines_math as _math
from cupy._core.core cimport _ndarray_base
from cupy.cuda import cub
try:
import cupy_backends.cuda.libs.cutensor as cuda_cutensor
except ImportError:
cuda_cutensor = None
cdef _ndarray_base _ndarray_max(
        _ndarray_base self, axis, out, dtype, keepdims):
    """Max reduction, trying the configured accelerators before the
    generic ``_amax`` kernel.
    """
    for acc in _accelerator._routine_accelerators:
        result = None
        if acc == _accelerator.ACCELERATOR_CUB:
            # CUB returns None when it cannot handle this reduction.
            result = cub.cub_reduction(
                self, cub.CUPY_CUB_MAX, axis, dtype, out, keepdims)
        elif acc == _accelerator.ACCELERATOR_CUTENSOR and \
                cuda_cutensor is not None:
            from cupyx import cutensor
            if self.dtype.kind == 'c' or dtype in ('F', 'D'):
                # cuTENSOR does not support complex dtypes here.
                continue
            result = cutensor._try_reduction_routine(
                self, axis, dtype, out, keepdims, cuda_cutensor.OP_MAX, 1, 0)
        if result is not None:
            return result
    # No accelerator produced a result: fall back to the generic kernel.
    return _amax(self, axis=axis, out=out, dtype=dtype, keepdims=keepdims)
cdef _ndarray_base _ndarray_min(
        _ndarray_base self, axis, out, dtype, keepdims):
    """Min reduction, trying the configured accelerators before the
    generic ``_amin`` kernel.
    """
    for accelerator in _accelerator._routine_accelerators:
        result = None
        if accelerator == _accelerator.ACCELERATOR_CUB:
            # result will be None if the reduction is not compatible with CUB
            # NOTE: cub_reduction takes (arr, op, axis, dtype, out, keepdims);
            # the previous code passed ``out`` before ``dtype``, unlike every
            # other call site in this file (_ndarray_max, _ndarray_arg*).
            result = cub.cub_reduction(
                self, cub.CUPY_CUB_MIN, axis, dtype, out, keepdims)
        if (accelerator == _accelerator.ACCELERATOR_CUTENSOR and
                cuda_cutensor is not None):
            from cupyx import cutensor
            if self.dtype.kind == 'c' or dtype in ('F', 'D'):
                # Complex dtype is not supported
                continue
            result = cutensor._try_reduction_routine(
                self, axis, dtype, out, keepdims, cuda_cutensor.OP_MIN, 1, 0)
        if result is not None:
            return result
    return _amin(self, axis=axis, out=out, dtype=dtype, keepdims=keepdims)
cdef _ndarray_base _ndarray_ptp(_ndarray_base self, axis, out, keepdims):
    """Peak-to-peak (max - min) reduction with accelerator fast paths."""
    for accelerator in _accelerator._routine_accelerators:
        if accelerator == _accelerator.ACCELERATOR_CUB:
            # result will be None if the reduction is not compatible with CUB
            # NOTE: cub_reduction takes (arr, op, axis, dtype, out, keepdims);
            # the previous code passed ``out`` in the dtype slot.
            result = cub.cub_reduction(
                self, cub.CUPY_CUB_MAX, axis, None, out, keepdims)
            if result is not None:
                # Subtract the min in place to obtain max - min.
                result -= cub.cub_reduction(
                    self, cub.CUPY_CUB_MIN, axis, None, None, keepdims)
                return result
        if (accelerator == _accelerator.ACCELERATOR_CUTENSOR and
                cuda_cutensor is not None):
            from cupyx import cutensor
            if self.dtype.kind == 'c':
                # Complex dtype is not supported
                continue
            maxv = cutensor._try_reduction_routine(
                self, axis, None, out, keepdims, cuda_cutensor.OP_MAX, 1, 0)
            if maxv is None:
                continue
            # Accumulate -min into maxv (alpha=-1, beta=1) => max - min.
            return cutensor._try_reduction_routine(
                self, axis, None, maxv, keepdims, cuda_cutensor.OP_MIN, -1, 1)
    result = _amax(self, axis=axis, out=out, keepdims=keepdims)
    result -= _amin(self, axis=axis, out=None, keepdims=keepdims)
    return result
# TODO(leofang): this signature is incompatible with NumPy!
cdef _ndarray_base _ndarray_argmax(
        _ndarray_base self, axis, out, dtype, keepdims):
    """Argmax reduction; tries the CUB accelerator before the generic
    ``_argmax`` kernel.
    """
    arr = self
    for acc in _accelerator._routine_accelerators:
        if acc != _accelerator.ACCELERATOR_CUB:
            continue
        if arr._f_contiguous and arr.dtype == numpy.bool_:
            # temporary workaround casting the inputs to int8:
            # CUB argmax seems to return different values to NumPy
            # for F-order bool array inputs.
            arr = arr.astype(numpy.int8)
        # result will be None if the reduction is not compatible with CUB.
        result = cub.cub_reduction(
            arr, cub.CUPY_CUB_ARGMAX, axis, dtype, out, keepdims)
        if result is not None:
            return result
    return _argmax(arr, axis=axis, out=out, dtype=dtype, keepdims=keepdims)
# TODO(leofang): this signature is incompatible with NumPy!
cdef _ndarray_base _ndarray_argmin(
        _ndarray_base self, axis, out, dtype, keepdims):
    """Argmin reduction; tries the CUB accelerator before the generic
    ``_argmin`` kernel.
    """
    for acc in _accelerator._routine_accelerators:
        if acc != _accelerator.ACCELERATOR_CUB:
            continue
        # CUB returns None when this reduction is unsupported.
        result = cub.cub_reduction(
            self, cub.CUPY_CUB_ARGMIN, axis, dtype, out, keepdims)
        if result is not None:
            return result
    return _argmin(self, axis=axis, out=out, dtype=dtype, keepdims=keepdims)
cdef _ndarray_base _ndarray_mean(
        _ndarray_base self, axis, dtype, out, keepdims):
    """Mean reduction with accelerator (CUB / cuTENSOR) fast paths.

    ``dtype_sum`` is the accumulator dtype and ``dtype_out`` the dtype of
    the returned array; integer/bool input accumulates in float64.
    """
    cdef Py_ssize_t n
    dtype_sum = dtype_out = dtype
    if dtype is None:
        if self.dtype.kind in 'iub':
            # Integer/bool input: accumulate and return in float64.
            dtype_out = numpy.float64
            dtype_sum = numpy.float64
        elif self.dtype.char == 'e':
            # Half precision: accumulate in float32 for accuracy, then
            # cast the result back down to float16.
            dtype_sum = numpy.float32
            dtype_out = numpy.float16
    elif numpy.dtype(dtype).kind in 'iub':
        # output will be the requested type, but compute the mean using float
        dtype_out = dtype
        dtype_sum = numpy.float64
    for accelerator in _accelerator._routine_accelerators:
        if accelerator == _accelerator.ACCELERATOR_CUB and self.size != 0:
            # CUB computes the sum; divide in place afterwards to get the
            # mean.  None means CUB could not handle this reduction.
            result = cub.cub_reduction(
                self, cub.CUPY_CUB_SUM, axis, dtype_sum, out, keepdims)
            if result is not None:
                n = self.size // result.size
                cupy.true_divide(result, n, out=result, casting='unsafe')
                break
        if (accelerator == _accelerator.ACCELERATOR_CUTENSOR and
                cuda_cutensor is not None):
            from cupyx import cutensor
            # Count the number of reduced elements to fold the 1/n scaling
            # into the cuTENSOR reduction (alpha = 1/n).
            reduce_axis, _ = _reduction._get_axis(axis, self._shape.size())
            n = 1
            for i in reduce_axis:
                n *= self._shape[i]
            n = max(n, 1)
            result = cutensor._try_reduction_routine(
                self, axis, dtype_sum, out, keepdims,
                cuda_cutensor.OP_ADD, 1.0 / n, 0)
            if result is not None:
                break
    else:
        # for/else: no accelerator produced a result, use the generic kernel.
        result = _mean(
            self, axis=axis, dtype=dtype_sum, out=out, keepdims=keepdims)
    if dtype_out is not None and out is None:
        # Cast the accumulator dtype down to the requested output dtype.
        result = result.astype(dtype_out)
    return result
cdef _ndarray_base _ndarray_var(
        _ndarray_base self, axis, dtype, out, ddof, keepdims):
    """Delegate variance computation to the module-level ``_var`` helper."""
    return _var(self, axis, dtype, out, ddof, keepdims)
cdef _ndarray_base _ndarray_std(
        _ndarray_base self, axis, dtype, out, ddof, keepdims):
    """Delegate standard deviation to the module-level ``_std`` helper."""
    return _std(self, axis, dtype, out, ddof, keepdims)
# CUDA preamble shared by the (arg)min/(arg)max reduction kernels below.
# min_max_st carries a value plus its flat index; index == -1 marks an
# "empty" (identity) element, and the *_float comparators propagate NaNs.
cdef _min_max_preamble = '''
template <typename T>
struct min_max_st{
    T value;
    int index;
    __device__ min_max_st() : index(-1) { }
    __device__ min_max_st(T v) : value(v), index(0) { }
    __device__ min_max_st(T v, int i) : value(v), index(i) { }
};

template <typename T>
__device__ min_max_st<T> my_min(
        const min_max_st<T>& a, const min_max_st<T>& b) {
    if (a.index == -1) return b;
    if (b.index == -1) return a;
    return min_max_st<T>(min(a.value, b.value));
}
template <typename T>
__device__ min_max_st<T> my_min_float(
        const min_max_st<T>& a, const min_max_st<T>& b) {
    if (a.index == -1) return b;
    if (b.index == -1) return a;
    if (isnan(a.value)) return a;
    if (isnan(b.value)) return b;
    return min_max_st<T>(min(a.value, b.value));
}

template <typename T>
__device__ min_max_st<T> my_max(
        const min_max_st<T>& a, const min_max_st<T>& b) {
    if (a.index == -1) return b;
    if (b.index == -1) return a;
    return min_max_st<T>(max(a.value, b.value));
}
template <typename T>
__device__ min_max_st<T> my_max_float(
        const min_max_st<T>& a, const min_max_st<T>& b) {
    if (a.index == -1) return b;
    if (b.index == -1) return a;
    if (isnan(a.value)) return a;
    if (isnan(b.value)) return b;
    return min_max_st<T>(max(a.value, b.value));
}

template <typename T>
__device__ min_max_st<T> my_argmin(
        const min_max_st<T>& a, const min_max_st<T>& b) {
    if (a.index == -1) return b;
    if (b.index == -1) return a;
    if (a.value == b.value)
        return min_max_st<T>(a.value, min(a.index, b.index));
    return (a.value <= b.value) ? a : b;
}
template <typename T>
__device__ min_max_st<T> my_argmin_float(
        const min_max_st<T>& a, const min_max_st<T>& b) {
    if (a.index == -1) return b;
    if (b.index == -1) return a;
    if (a.value == b.value)
        return min_max_st<T>(a.value, min(a.index, b.index));
    if (isnan(a.value)) return a;
    if (isnan(b.value)) return b;
    return (a.value <= b.value) ? a : b;
}

template <typename T>
__device__ min_max_st<T> my_argmax(
        const min_max_st<T>& a, const min_max_st<T>& b) {
    if (a.index == -1) return b;
    if (b.index == -1) return a;
    if (a.value == b.value)
        return min_max_st<T>(a.value, min(a.index, b.index));
    return (a.value >= b.value) ? a : b;
}
template <typename T>
__device__ min_max_st<T> my_argmax_float(
        const min_max_st<T>& a, const min_max_st<T>& b) {
    if (a.index == -1) return b;
    if (b.index == -1) return a;
    if (a.value == b.value)
        return min_max_st<T>(a.value, min(a.index, b.index));
    if (isnan(a.value)) return a;
    if (isnan(b.value)) return b;
    return (a.value >= b.value) ? a : b;
}
'''
# Generic min reduction.  Floating/complex dtypes use my_min_float so that
# NaN values propagate to the result, matching NumPy semantics.
cdef _amin = create_reduction_func(
    'cupy_min',
    ('?->?', 'b->b', 'B->B', 'h->h', 'H->H', 'i->i', 'I->I', 'l->l', 'L->L',
     'q->q', 'Q->Q',
     ('e->e', (None, 'my_min_float(a, b)', None, None)),
     ('f->f', (None, 'my_min_float(a, b)', None, None)),
     ('d->d', (None, 'my_min_float(a, b)', None, None)),
     ('F->F', (None, 'my_min_float(a, b)', None, None)),
     ('D->D', (None, 'my_min_float(a, b)', None, None))),
    ('min_max_st<type_in0_raw>(in0)', 'my_min(a, b)', 'out0 = a.value',
     'min_max_st<type_in0_raw>'),
    None, _min_max_preamble)

# Generic max reduction; NaN-propagating comparators for float/complex.
cdef _amax = create_reduction_func(
    'cupy_max',
    ('?->?', 'b->b', 'B->B', 'h->h', 'H->H', 'i->i', 'I->I', 'l->l', 'L->L',
     'q->q', 'Q->Q',
     ('e->e', (None, 'my_max_float(a, b)', None, None)),
     ('f->f', (None, 'my_max_float(a, b)', None, None)),
     ('d->d', (None, 'my_max_float(a, b)', None, None)),
     ('F->F', (None, 'my_max_float(a, b)', None, None)),
     ('D->D', (None, 'my_max_float(a, b)', None, None)),
     ),
    ('min_max_st<type_in0_raw>(in0)', 'my_max(a, b)', 'out0 = a.value',
     'min_max_st<type_in0_raw>'),
    None, _min_max_preamble)

# NaN-ignoring variants: they use the plain comparators, so NaNs are simply
# not selected (min/max skip them).
nanmin = create_reduction_func(
    'cupy_nanmin',
    ('?->?', 'b->b', 'B->B', 'h->h', 'H->H', 'i->i', 'I->I', 'l->l', 'L->L',
     'q->q', 'Q->Q', 'e->e', 'f->f', 'd->d', 'F->F', 'D->D'),
    ('min_max_st<type_in0_raw>(in0)', 'my_min(a, b)', 'out0 = a.value',
     'min_max_st<type_in0_raw>'),
    None, _min_max_preamble)

nanmax = create_reduction_func(
    'cupy_nanmax',
    ('?->?', 'b->b', 'B->B', 'h->h', 'H->H', 'i->i', 'I->I', 'l->l', 'L->L',
     'q->q', 'Q->Q', 'e->e', 'f->f', 'd->d', 'F->F', 'D->D'),
    ('min_max_st<type_in0_raw>(in0)', 'my_max(a, b)', 'out0 = a.value',
     'min_max_st<type_in0_raw>'),
    None, _min_max_preamble)

# Argmin/argmax: _J is the flat input index; ties resolve to the smaller
# index.  sort_reduce_axis=False keeps the original element order so that
# indices stay meaningful.
cdef _argmin = create_reduction_func(
    'cupy_argmin',
    tuple(['{}->{}'.format(d, r) for r in 'qlihb' for d in '?BhHiIlLqQ'])
    + (
        ('e->q', (None, 'my_argmin_float(a, b)', None, None)),
        ('f->q', (None, 'my_argmin_float(a, b)', None, None)),
        ('d->q', (None, 'my_argmin_float(a, b)', None, None)),
        ('F->q', (None, 'my_argmin_float(a, b)', None, None)),
        ('D->q', (None, 'my_argmin_float(a, b)', None, None))),
    ('min_max_st<type_in0_raw>(in0, _J)', 'my_argmin(a, b)', 'out0 = a.index',
     'min_max_st<type_in0_raw>'),
    None, _min_max_preamble, sort_reduce_axis=False)

cdef _argmax = create_reduction_func(
    'cupy_argmax',
    tuple(['{}->{}'.format(d, r) for r in 'qlihb' for d in '?BhHiIlLqQ'])
    + (
        ('e->q', (None, 'my_argmax_float(a, b)', None, None)),
        ('f->q', (None, 'my_argmax_float(a, b)', None, None)),
        ('d->q', (None, 'my_argmax_float(a, b)', None, None)),
        ('F->q', (None, 'my_argmax_float(a, b)', None, None)),
        ('D->q', (None, 'my_argmax_float(a, b)', None, None))),
    ('min_max_st<type_in0_raw>(in0, _J)', 'my_argmax(a, b)', 'out0 = a.index',
     'min_max_st<type_in0_raw>'),
    None, _min_max_preamble, sort_reduce_axis=False)
cpdef _ndarray_base _nanargmax(_ndarray_base a, axis, out, dtype, keepdims):
    """Index of the maximum, ignoring NaNs, via the dedicated kernel."""
    options = dict(axis=axis, out=out, dtype=dtype, keepdims=keepdims)
    return _nanargmax_func(a, **options)
cpdef _ndarray_base _nanargmin(_ndarray_base a, axis, out, dtype, keepdims):
    """Index of the minimum, ignoring NaNs, via the dedicated kernel."""
    options = dict(axis=axis, out=out, dtype=dtype, keepdims=keepdims)
    return _nanargmin_func(a, **options)
# NaN-ignoring argmin/argmax: a NaN element is mapped to index -1, the
# "empty" marker, so the comparators skip it entirely.
cdef _nanargmin_func = create_reduction_func(
    'cupy_nanargmin',
    ('?->q', 'B->q', 'h->q', 'H->q', 'i->q', 'I->q', 'l->q', 'L->q',
     'q->q', 'Q->q',
     ('e->q', (None, 'my_argmin_float(a, b)', None, None)),
     ('f->q', (None, 'my_argmin_float(a, b)', None, None)),
     ('d->q', (None, 'my_argmin_float(a, b)', None, None)),
     ('F->q', (None, 'my_argmin_float(a, b)', None, None)),
     ('D->q', (None, 'my_argmin_float(a, b)', None, None))),
    ('min_max_st<type_in0_raw>(in0, isnan(in0) ? -1 : _J)',
     'my_argmin(a, b)', 'out0 = a.index', 'min_max_st<type_in0_raw>'),
    None, _min_max_preamble, sort_reduce_axis=False)

cdef _nanargmax_func = create_reduction_func(
    'cupy_nanargmax',
    ('?->q', 'B->q', 'h->q', 'H->q', 'i->q', 'I->q', 'l->q', 'L->q',
     'q->q', 'Q->q',
     ('e->q', (None, 'my_argmax_float(a, b)', None, None)),
     ('f->q', (None, 'my_argmax_float(a, b)', None, None)),
     ('d->q', (None, 'my_argmax_float(a, b)', None, None)),
     ('F->q', (None, 'my_argmax_float(a, b)', None, None)),
     ('D->q', (None, 'my_argmax_float(a, b)', None, None))),
    ('min_max_st<type_in0_raw>(in0, isnan(in0) ? -1 : _J)',
     'my_argmax(a, b)', 'out0 = a.index', 'min_max_st<type_in0_raw>'),
    None, _min_max_preamble, sort_reduce_axis=False)

# True along the reduced axis iff at least one element is NaN.
cdef _exists_nan = ReductionKernel(
    'T x', 'bool y', 'isnan(x)', 'a || b', 'y = a', 'false', '_exists_nan')
cpdef _ndarray_base _median(
        _ndarray_base a, axis, out, overwrite_input, keepdims):
    """Median along ``axis`` via a partial sort (``partition``).

    Sequence axes are folded into a single trailing axis first; the median
    is then the mean of the one or two middle partitioned elements.  NaNs
    in float/complex input force a NaN result for that lane.
    """
    keep_ndim = a.ndim

    out_shape = None
    if sequence.PySequence_Check(axis):
        # cupy.sort and cupy.partition only support integer axis, so move
        # all reduced dimensions to the end and reshape them into a single
        # reduction axis.
        reduce_axis, out_axis = _reduction._get_axis(axis, keep_ndim)
        out_shape = _reduction._get_out_shape(a.shape, reduce_axis, out_axis,
                                              keepdims)
        a = a.transpose(out_axis + reduce_axis)
        sort_shape = tuple([a.shape[n] for n in range(len(out_axis))]) + (-1,)
        a = a.reshape(sort_shape)
        if not a.flags.c_contiguous:
            a = cupy.ascontiguousarray(a)
        axis = -1

    if axis is None:
        sz = a.size
    else:
        if axis < -keep_ndim or axis >= keep_ndim:
            raise numpy.AxisError('Axis overrun')
        sz = a.shape[axis]
    # For even length we need the two middle elements; for odd, just one.
    if sz % 2 == 0:
        szh = sz // 2
        kth = [szh - 1, szh]
    else:
        kth = [(sz - 1) // 2]

    if overwrite_input:
        part = a
    else:
        # Partition mutates in place; work on a copy unless allowed.
        part = a.copy()
    if axis is None:
        part = part.ravel()
        part.partition(kth)
    else:
        part.partition(kth, axis=axis)

    if part.shape == ():
        # 0-d result: the median of a scalar is itself.
        return part
    if axis is None:
        axis = 0

    indexer = [slice(None)] * part.ndim
    if keepdims and out_shape is None:
        # Re-insert the dimensions removed by ravel() as length-1 axes.
        _indexer = [None] * (keep_ndim - part.ndim)
        indexer.extend(_indexer)

    index = part.shape[axis] // 2
    if part.shape[axis] % 2 == 1:
        indexer[axis] = slice(index, index+1)
    else:
        indexer[axis] = slice(index-1, index+1)
    indexer = tuple(indexer)

    # Mean of the middle element(s) gives the median.
    out = _mean(
        part[indexer], axis=axis, dtype=None, out=out, keepdims=keepdims)
    if part.dtype.kind in 'fc':
        # Any NaN along the reduced axis makes that median NaN (NumPy rule).
        isnan = _exists_nan(part, axis=axis, keepdims=keepdims)
        out = cupy.where(isnan, numpy.nan, out)
    if out_shape is not None:
        out = out.reshape(out_shape)
    return out
cpdef _ndarray_base _nanmedian(
        _ndarray_base a, axis, out, overwrite_input, keepdims):
    """Median along ``axis`` ignoring NaNs.

    NaNs are replaced with the dtype's max so a full sort pushes them past
    the valid elements; a per-lane counter tracks how many valid elements
    remain, and the median is picked from that reduced length.
    """
    if axis is None:
        axis = tuple(range(a.ndim))
    if not sequence.PySequence_Check(axis):
        axis = (axis,)

    # Split the dimensions into reduced and kept ones (negative axis values
    # are accepted via the ``i - a.ndim in axis`` test).
    reduce_axis = []
    reduce_shape = []
    out_axis = []
    out_shape = []
    for i in range(a.ndim):
        if axis is None or i in axis or i - a.ndim in axis:
            reduce_axis.append(i)
            reduce_shape.append(a.shape[i])
        else:
            out_axis.append(i)
            out_shape.append(a.shape[i])

    a_data_ptr = a.data.ptr
    a = a.transpose(out_axis + reduce_axis)
    a = a.reshape(out_shape + [-1, ])
    a = cupy.ascontiguousarray(a)
    n_reduce = numpy.prod(reduce_shape)
    n_reduce_each = cupy.full(out_shape, n_reduce, dtype='int32')
    if a_data_ptr == a.data.ptr and overwrite_input is False:
        # The transpose/reshape produced a view on the original data; copy
        # before mutating since overwriting the input is not allowed.
        a = a.copy()
    # Push NaNs to the end of each lane and decrement that lane's count.
    _replace_nan_kernel(n_reduce, numpy.finfo(a.dtype).max, a, n_reduce_each)
    a = cupy.sort(a, axis=-1)

    # Lanes that were all-NaN keep their initial NaN value.
    b = cupy.full(out_shape, cupy.nan, dtype=a.dtype)
    _pickup_median_kernel(n_reduce, n_reduce_each, a, b)

    if keepdims:
        # Restore the reduced dimensions as length-1 axes in original order.
        b = b.reshape(out_shape + [1, ] * len(reduce_axis))
        axes = [-1, ] * b.ndim
        for i, j in enumerate(out_axis + reduce_axis):
            axes[j] = i
        b = b.transpose(axes)

    if out is None:
        out = b
    else:
        elementwise_copy(b, out)
    return out
# Replace NaNs (detected via a != a) with ``val`` and atomically decrement
# the valid-element counter of the lane the element belongs to.
cdef _replace_nan_kernel = ElementwiseKernel(
    'I n_reduce, T val', 'T a, raw I n_reduce_each',
    '''
    if (a != a) {
        a = val;
        atomicAdd(&(n_reduce_each[i / n_reduce]), -1);
    }
    ''',
    'cupy_replace_nan'
)

# Pick the median of each sorted lane using its valid-element count; lanes
# with no valid elements (count == 0) are left untouched (stay NaN).
cdef _pickup_median_kernel = ElementwiseKernel(
    'I n_reduce, I n_reduce_each, raw T a', 'T b',
    '''
    if (n_reduce_each > 0) {
        int l = (n_reduce_each - 1) / 2;
        int h = (n_reduce_each    ) / 2;
        if (l == h) {
            b = a[l + n_reduce * i];
        } else {
            b = (a[l + n_reduce * i] + a[h + n_reduce * i])
                / static_cast<T>(2.0);
        }
    }
    ''',
    'cupy_pickup_median'
)
cdef _ndarray_base _mean(
        _ndarray_base a, axis=None, dtype=None, out=None, keepdims=False):
    """Mean reduction; empty input uses a kernel whose identity yields NaN.

    See https://github.com/numpy/numpy/issues/13582 for the empty-input rule.
    """
    kernel = _mean_core_empty if a.size == 0 else _mean_core
    return kernel(a, axis, dtype, out, keepdims)
cdef _ndarray_base _var(
        _ndarray_base a, axis=None, dtype=None, out=None, ddof=0,
        keepdims=False):
    """Variance over ``axis`` with ``ddof`` delta degrees of freedom.

    Integer/bool input computes in float64; complex input yields a real
    result of the matching precision.
    """
    if axis is None:
        axis = tuple(range(a.ndim))
    if not isinstance(axis, tuple):
        axis = (axis,)

    dtype_mean = a.dtype
    dtype_out = numpy.dtype(dtype)
    if dtype is None:
        if a.dtype.kind in 'biu':
            dtype_mean = 'float64'
            dtype_out = 'float64'
        else:
            dtype_mean = a.dtype
            dtype_out = a.dtype
            if a.dtype.kind == 'c':
                # Variance of complex data is real: use the matching
                # real dtype ('F' -> 'f', 'D' -> 'd').
                dtype_out = numpy.dtype(a.dtype.char.lower())

    shape = a.shape
    cdef Py_ssize_t items = 1
    for ax in axis:
        items *= shape[ax]

    # Make alpha NaN when array is empty, mimics NumPy behavior, resulting in
    # NaN. See https://github.com/numpy/numpy/issues/13582 for an explanation
    # on why NaN is the result.
    div = max(items - ddof, 0)
    alpha = 1. / div if div != 0 else nan
    arrmean = a.mean(axis=axis, dtype=dtype_mean, out=None, keepdims=True)
    if out is None:
        # Pick the reduction kernel matching the output precision.
        if dtype_out == 'float16':
            var_core = _var_core_float16
        elif dtype_out == 'float32':
            var_core = _var_core_float32
        else:
            var_core = _var_core_float64
        return var_core(a, arrmean, alpha, axis=axis, keepdims=keepdims)

    out = _var_core_out(a, arrmean, alpha, out, axis=axis, keepdims=keepdims)
    return out.astype(dtype_out, copy=False)
cdef _ndarray_base _std(
        _ndarray_base a, axis=None, dtype=None, out=None, ddof=0,
        keepdims=False):
    """Standard deviation: the square root of ``_var`` with same arguments."""
    variance = _var(
        a, axis=axis, dtype=dtype, out=None, ddof=ddof, keepdims=keepdims)
    return _math._sqrt(variance, dtype=dtype, out=out)
# my_norm(x) = |x|^2 for both real and complex inputs; used by the variance
# kernels below so the same map expression works for complex dtypes.
cdef _norm_preamble = '''
template <typename T> __device__ T my_norm(T x) { return x * x; }
__device__ float my_norm(const complex<float>& x) { return norm(x); }
__device__ double my_norm(const complex<double>& x) { return norm(x); }
'''

# Variance kernels: sum of squared deviations from ``mean``, scaled by
# ``alpha`` (1 / (N - ddof)); one kernel per output precision.
cdef _var_core_float16 = ReductionKernel(
    'S x, T mean, float32 alpha', 'float16 out',
    'my_norm(x - mean)',
    'a + b', 'out = alpha * a', '0', 'cupy_var_core_float16',
    preamble=_norm_preamble)

cdef _var_core_float32 = ReductionKernel(
    'S x, T mean, float32 alpha', 'float32 out',
    'my_norm(x - mean)',
    'a + b', 'out = alpha * a', '0', 'cupy_var_core_float32',
    preamble=_norm_preamble)

cdef _var_core_float64 = ReductionKernel(
    'S x, T mean, float64 alpha', 'float64 out',
    'my_norm(x - mean)',
    'a + b', 'out = alpha * a', '0', 'cupy_var_core_float64',
    preamble=_norm_preamble)

# Variant used when the caller supplies an ``out`` array of arbitrary dtype.
cdef _var_core_out = ReductionKernel(
    'S x, T mean, U alpha', 'U out',
    'my_norm(x - mean)',
    'a + b', 'out = alpha * a', '0', 'cupy_var_core_out',
    preamble=_norm_preamble)
# TODO(okuta) needs cast
# Mean reduction: sum, then divide by the number of reduced elements
# (the in/out index-space size ratio).
cdef _mean_core = create_reduction_func(
    'cupy_mean',
    ('?->d', 'B->d', 'h->d', 'H->d', 'i->d', 'I->d', 'l->d', 'L->d',
     'q->d', 'Q->d',
     ('e->e', (None, None, None, 'float')),
     'f->f', 'd->d', 'F->F', 'D->D'),
    ('in0', 'a + b',
     'out0 = a / _type_reduce(_in_ind.size() / _out_ind.size())', None))

# Same as _mean_core but with identity 0, used for empty input so the
# division produces NaN instead of raising.
cdef _mean_core_empty = create_reduction_func(
    'cupy_mean_empty',
    ('?->d', 'B->d', 'h->d', 'H->d', 'i->d', 'I->d', 'l->d', 'L->d',
     'q->d', 'Q->d',
     ('e->e', (None, None, None, 'float')),
     'f->f', 'd->d', 'F->F', 'D->D'),
    ('in0', 'a + b',
     'out0 = a / _type_reduce(_in_ind.size() / _out_ind.size())', None), 0)

# nanmean accumulator: carries the running sum of non-NaN values plus a
# count of how many contributed, so the final division ignores NaNs.
cdef _nanmean_preamble = '''
template <typename T>
struct nanmean_st{
    typedef long long ll;
    T value;
    ll count;
    __device__ nanmean_st() : value(0), count(0) { }
    __device__ nanmean_st(T v) :
        value(isnan(v) ? T(0) : v), count(isnan(v) ? 0 : 1) { }
    __device__ nanmean_st(T v, ll c) : value(v), count(c) { }
};

template <typename T>
__device__ nanmean_st<T> my_nanmean(
        const nanmean_st<T>& a, const nanmean_st<T>& b) {
    return nanmean_st<T>(a.value + b.value, a.count + b.count);
}
'''

cdef _nanmean_func = create_reduction_func(
    'cupy_nanmean',
    ('e->e', 'f->f', 'd->d', 'F->F', 'D->D'),
    ('in0', 'my_nanmean(a, b)',
     'out0 = a.value / type_out0_raw(a.count)', 'nanmean_st<type_out0_raw>'),
    None, _nanmean_preamble)

# Number of non-NaN elements along the reduced axis (used by nanvar).
_count_non_nan = create_reduction_func(
    'cupy_count_non_nan',
    ('e->q', 'f->q', 'd->q', 'F->q', 'D->q'),
    ('isnan(in0) ? 0 : 1', 'a + b', 'out0 = a', None), 0)
cpdef _ndarray_base _nanmean(_ndarray_base a, axis, dtype, out, keepdims):
    """Mean ignoring NaNs, via the dedicated reduction kernel."""
    options = dict(axis=axis, dtype=dtype, out=out, keepdims=keepdims)
    return _nanmean_func(a, **options)
cpdef _ndarray_base _nanstd(_ndarray_base a, axis, dtype, out, ddof, keepdims):
    """Standard deviation ignoring NaNs: square root of ``_nanvar``."""
    variance = _nanvar(a, axis, dtype, None, ddof, keepdims)
    return _math._sqrt(variance, dtype=dtype, out=out)
cpdef _ndarray_base _nanvar(_ndarray_base a, axis, dtype, out, ddof, keepdims):
    """Variance ignoring NaNs, with ``ddof`` delta degrees of freedom."""
    _count = _count_non_nan(a, axis=axis, keepdims=True)
    arrsum = _math._nansum(a, axis=axis, dtype=dtype, out=None, keepdims=True)
    if out is not None:
        _nanvar_core_out(
            a, arrsum, _count, ddof, out, axis=axis, keepdims=keepdims)
        return out
    # Pick the kernel matching the (possibly complex) input/output dtype.
    if cupy.complex64 in (a.dtype, dtype):
        kernel = _nanvar_core_complex64
    elif cupy.complex128 in (a.dtype, dtype):
        kernel = _nanvar_core_complex128
    else:
        kernel = _nanvar_core
    return kernel(a, arrsum, _count, ddof, axis=axis, keepdims=keepdims)
# Per-element nanvar term: squared deviation from the (nan-aware) mean,
# pre-divided by max(count - ddof, 0); NaN elements contribute 0.
cdef _nanvar_preamble = '''
template <typename S, typename T>
__device__ T nanvar_impl(S x, T mean, long long alpha) {
    return (isnan(x) ? T(0) : T((x - mean) * (x - mean))) / alpha;
}

template <typename S, typename T>
__device__ T nanvar_impl(complex<S> x, complex<T> mean, long long alpha) {
    return (isnan(x) ? T(0) : T(norm(x - mean))) / alpha;
}
'''

cdef _nanvar_core = ReductionKernel(
    'S x, T sum, int64 _count, int64 ddof', 'S out',
    'nanvar_impl(x, sum / _count, max(_count - ddof, 0LL))',
    'a + b', 'out = a', '0', '_nanvar_core', preamble=_nanvar_preamble)

# Complex input variants: variance of complex data is real-valued.
cdef _nanvar_core_complex64 = ReductionKernel(
    'complex64 x, complex64 sum, int64 _count, int64 ddof', 'float32 out',
    'nanvar_impl(x, sum/static_cast<float>(_count), max(_count-ddof, 0LL))',
    'a + b', 'out = a', '0', '_nanvar_core_complex64',
    preamble=_nanvar_preamble)

cdef _nanvar_core_complex128 = ReductionKernel(
    'complex128 x, complex128 sum, int64 _count, int64 ddof', 'float64 out',
    'nanvar_impl(x, sum/static_cast<double>(_count), max(_count-ddof, 0LL))',
    'a + b', 'out = a', '0', '_nanvar_core_complex128',
    preamble=_nanvar_preamble)

# Variant used when the caller supplies an ``out`` array.
cdef _nanvar_core_out = ReductionKernel(
    'S x, T sum, int64 _count, int64 ddof', 'U out',
    'nanvar_impl(x, sum / static_cast<T>(_count), max(_count - ddof, 0LL))',
    'a + b', 'out = a', '0', '_nanvar_core', preamble=_nanvar_preamble)
# Variables to expose to Python
# (cythonized data cannot be exposed to Python, even with cpdef.)
amax = _amax
amin = _amin
cimport cython # NOQA
from libc.stdint cimport int8_t
from libc.stdint cimport int32_t
from cupy.cuda.function cimport CPointer
@cython.final
cdef class CScalar(CPointer):
    # Scalar kernel argument wrapper; passed to CUDA kernels via the
    # inherited CPointer interface.
    cdef:
        char kind    # dtype kind character, presumably as in numpy
                     # ('b'/'i'/'u'/'f'/'c') — TODO confirm against .pyx
        int8_t size  # itemsize of the held scalar in bytes

    @staticmethod
    cdef CScalar from_int32(int32_t value)

    @staticmethod
    cdef CScalar from_numpy_scalar_with_dtype(object x, object dtype)

    @staticmethod
    cdef CScalar _from_python_scalar(object x)

    @staticmethod
    cdef CScalar _from_numpy_scalar(object x)

    # Re-interpret / convert the held value to ``dtype``.
    cpdef apply_dtype(self, dtype)
    # Return the numpy scalar type corresponding to the held value.
    cpdef get_numpy_type(self)

# Free functions of the scalar module (implemented in the .pyx file).
cpdef str get_typename(dtype)
cdef set scalar_type_set
cdef CScalar scalar_to_c_scalar(object x)
cdef object scalar_to_numpy_scalar(object x)
cpdef str _get_cuda_scalar_repr(obj, dtype)
from cpython cimport mem
from libc.stdint cimport int8_t
from libc.stdint cimport int16_t
from libc.stdint cimport int32_t
from libc.stdint cimport int64_t
from libc.stdint cimport uint8_t
from libc.stdint cimport uint16_t
from libc.stdint cimport uint32_t
from libc.stdint cimport uint64_t
import numpy
from cupy._core cimport _dtype
from cupy._core import _dtype as _dtype_module
from cupy._core cimport internal
cdef union Scalar:
    # C-level storage for a scalar of any supported dtype; only one member
    # is meaningful at a time (a union shares storage).
    bint bool_
    int8_t int8_
    int16_t int16_
    int32_t int32_
    int64_t int64_
    uint8_t uint8_
    uint16_t uint16_
    uint32_t uint32_
    uint64_t uint64_
    float float32_
    double float64_
# NumPy dtype -> CUDA C/C++ type name, used when rendering kernel source.
cdef dict _typenames_base = {
    numpy.dtype('float64'): 'double',
    numpy.dtype('float32'): 'float',
    numpy.dtype('float16'): 'float16',
    numpy.dtype('complex128'): 'complex<double>',
    numpy.dtype('complex64'): 'complex<float>',
    numpy.dtype('int64'): 'long long',
    numpy.dtype('int32'): 'int',
    numpy.dtype('int16'): 'short',
    numpy.dtype('int8'): 'signed char',
    numpy.dtype('uint64'): 'unsigned long long',
    numpy.dtype('uint32'): 'unsigned int',
    numpy.dtype('uint16'): 'unsigned short',
    numpy.dtype('uint8'): 'unsigned char',
    numpy.dtype('bool'): 'bool',
}
# NumPy scalar constructors cached at module load to avoid repeated
# attribute lookups on the numpy module in hot paths.
cdef object _numpy_bool_ = numpy.bool_
cdef object _numpy_int8 = numpy.int8
cdef object _numpy_int16 = numpy.int16
cdef object _numpy_int32 = numpy.int32
cdef object _numpy_int64 = numpy.int64
cdef object _numpy_uint8 = numpy.uint8
cdef object _numpy_uint16 = numpy.uint16
cdef object _numpy_uint32 = numpy.uint32
cdef object _numpy_uint64 = numpy.uint64
cdef object _numpy_float16 = numpy.float16
cdef object _numpy_float32 = numpy.float32
cdef object _numpy_float64 = numpy.float64
cdef object _numpy_complex64 = numpy.complex64
cdef object _numpy_complex128 = numpy.complex128
# NOTE(review): numpy.float_ / numpy.complex_ are deprecated aliases that
# were removed in NumPy 2.0 -- this module apparently targets NumPy 1.x;
# verify before upgrading the NumPy dependency.
cdef object _numpy_float_ = numpy.float_
cdef object _numpy_complex_ = numpy.complex_
cpdef str get_typename(dtype):
    """Return the CUDA C/C++ type name registered for ``dtype``."""
    if dtype is None:
        raise ValueError('dtype is None')
    # Fast path: the key is already registered (a NumPy scalar type or a
    # CUDA-specific name such as 'cudaTextureObject_t').
    try:
        return _typenames[dtype]
    except KeyError:
        pass
    # Normalize an arbitrary dtype-like object to its scalar type and retry.
    return _typenames[_dtype.get_dtype(dtype).type]
# scalar type -> CUDA type name; populated by _setup_type_dict().
cdef dict _typenames = {}
# scalar type -> (kind char, itemsize); populated by _setup_type_dict().
cdef dict _dtype_kind_size_dict = {}


cdef _setup_type_dict():
    # Fill the two lookup tables above from the supported dtype characters,
    # then register CUDA-specific opaque type names.
    cdef char k
    for i in _dtype_module.all_type_chars:
        d = numpy.dtype(i)
        t = d.type
        _typenames[t] = _typenames_base[d]
        k = ord(d.kind)
        _dtype_kind_size_dict[t] = (k, d.itemsize)
    # CUDA types
    for t in ('cudaTextureObject_t',):
        _typenames[t] = t
_setup_type_dict()

# Scalar types accepted as kernel arguments.
cdef set _python_scalar_type_set = {int, float, bool, complex}
cdef set _numpy_scalar_type_set = set(_typenames.keys())
cdef set scalar_type_set = _python_scalar_type_set | _numpy_scalar_type_set

# Range of the platform's native integer type: numpy.iinfo(int) is int64 on
# most platforms but int32 on Windows (see _python_scalar_to_numpy_scalar).
_int_iinfo = numpy.iinfo(int)
cdef _int_min = _int_iinfo.min
cdef _int_max = _int_iinfo.max
cdef _int_type = _int_iinfo.dtype.type
cdef bint _use_int32 = _int_type != _numpy_int64
del _int_iinfo
cpdef _python_scalar_to_numpy_scalar(x):
    """Convert a Python bool/int/float/complex to the matching NumPy scalar."""
    # bool must be tested by exact type: isinstance(x, int) also matches it.
    typ = type(x)
    if typ is bool:
        return _numpy_bool_(x)
    if typ is float:
        return _numpy_float_(x)
    if typ is complex:
        return _numpy_complex_(x)
    # Remaining case: a Python int.
    if x >= 0x8000000000000000:
        # Does not fit in int64; only uint64 can hold it.
        return _numpy_uint64(x)
    if _use_int32 and not (_int_min <= x <= _int_max):
        return _numpy_int64(x)
    # Generally `_int_type` is `numpy.int64`.
    # On Windows, it is `numpy.int32`.
    return _int_type(x)
cdef class CScalar(CPointer):
    # Host-side storage for one scalar kernel argument.  `self.ptr`
    # (inherited from CPointer) points at a heap buffer big enough for any
    # supported scalar, including a double complex.

    ndim = 0  # scalars are 0-dimensional, mirroring the ndarray interface

    def __cinit__(self):
        # sizeof(Scalar) does not cover double complex, hence the max().
        self.ptr = mem.PyMem_Malloc(
            max(sizeof(Scalar), sizeof(double complex)))
        self.kind = 0
        self.size = -1

    def __dealloc__(self):
        mem.PyMem_Free(self.ptr)
        self.ptr = <void*>0

    @staticmethod
    cdef CScalar from_int32(int32_t value):
        # Fast constructor for a 32-bit signed integer argument.
        cdef CScalar s = CScalar.__new__(CScalar)
        (<int32_t *>s.ptr)[0] = value
        s.kind = b'i'
        s.size = 4
        return s

    @staticmethod
    cdef CScalar from_numpy_scalar_with_dtype(object x, object dtype):
        # Convert a NumPy scalar, then immediately narrow it to `dtype`.
        cdef CScalar ret = CScalar._from_numpy_scalar(x)
        ret.apply_dtype(dtype)
        return ret

    @staticmethod
    cdef CScalar _from_python_scalar(object x):
        # Store a Python scalar in its widest representation within its kind
        # (int64/uint64/float64/complex128); apply_dtype() narrows later.
        cdef CScalar ret = CScalar.__new__(CScalar)
        cdef Scalar* s = <Scalar*>ret.ptr
        typ = type(x)
        if typ is bool:
            s.bool_ = x
            ret.kind = b'b'
            ret.size = 1
        elif typ is float:
            s.float64_ = x
            ret.kind = b'f'
            ret.size = 8
        elif typ is complex:
            (<double complex*>ret.ptr)[0] = x
            ret.kind = b'c'
            ret.size = 16
        else:
            # Python int: values >= 2**63 only fit in uint64.
            if 0x8000000000000000 <= x:
                s.uint64_ = x
                ret.kind = b'u'
            else:
                s.int64_ = x
                ret.kind = b'i'
            ret.size = 8
        return ret

    @staticmethod
    cdef CScalar _from_numpy_scalar(object x):
        # Store a NumPy scalar, widened to 64 bits within its own kind.
        cdef CScalar ret = CScalar.__new__(CScalar)
        cdef Scalar* s = <Scalar*>ret.ptr
        ret.kind = ord(x.dtype.kind)
        if ret.kind == b'i':
            s.int64_ = x
            ret.size = 8
        elif ret.kind == b'u':
            s.uint64_ = x
            ret.size = 8
        elif ret.kind == b'f':
            s.float64_ = x
            ret.size = 8
        elif ret.kind == b'b':
            s.bool_ = x
            ret.size = 1
        elif ret.kind == b'c':
            (<double complex*>ret.ptr)[0] = x
            ret.size = 16
        else:
            assert False
        return ret

    cpdef apply_dtype(self, dtype):
        """Re-store the held value in-place as ``dtype``.

        Reads the value back in its current representation, then writes it
        again in the target kind/size and updates ``self.kind``/``self.size``.
        """
        cdef Scalar* s = <Scalar*>self.ptr
        # Read the current value out of the union.
        if self.kind == b'b':
            val = s.bool_
            assert self.size == 1
        elif self.kind == b'c':
            assert self.size == 16
            val = (<double complex*>self.ptr)[0]
        else:
            assert self.size == 8
            if self.kind == b'i':
                val = s.int64_
            elif self.kind == b'u':
                val = s.uint64_
            elif self.kind == b'f':
                val = s.float64_
            else:
                assert False
        cdef char kind
        cdef int size
        kind, size = <tuple>_dtype_kind_size_dict[dtype]
        cdef int64_t val_i
        cdef uint64_t val_u
        # Write it back in the target representation.
        if kind == b'b':
            s.bool_ = val
            assert size == 1
        elif kind == b'i':
            if self.kind == b'u':
                # avoid overflow exception
                val_i = s.uint64_
            else:
                val_i = val
            if size == 1:
                s.int8_ = val_i
            elif size == 2:
                s.int16_ = val_i
            elif size == 4:
                s.int32_ = val_i
            elif size == 8:
                s.int64_ = val_i
            else:
                assert False
        elif kind == b'u':
            if self.kind == b'i':
                # avoid overflow exception
                val_u = s.int64_
            else:
                val_u = val
            if size == 1:
                s.uint8_ = val_u
            elif size == 2:
                s.uint16_ = val_u
            elif size == 4:
                s.uint32_ = val_u
            elif size == 8:
                s.uint64_ = val_u
            else:
                assert False
        elif kind == b'f':
            if size == 2:
                # float16 has no C scalar type; store its bit pattern.
                s.uint16_ = internal.to_float16(<float>val)
            elif size == 4:
                s.float32_ = val
            elif size == 8:
                s.float64_ = val
            else:
                assert False
        elif kind == b'c':
            if size == 8:
                (<float complex*>self.ptr)[0] = val
            elif size == 16:
                (<double complex*>self.ptr)[0] = val
            else:
                assert False
        else:
            assert False
        self.kind = kind
        self.size = size

    cpdef get_numpy_type(self):
        """Return the NumPy scalar type matching the held kind/size."""
        if self.kind == b'b':
            return _numpy_bool_
        elif self.kind == b'i':
            if self.size == 1:
                return _numpy_int8
            elif self.size == 2:
                return _numpy_int16
            elif self.size == 4:
                return _numpy_int32
            elif self.size == 8:
                return _numpy_int64
        elif self.kind == b'u':
            if self.size == 1:
                return _numpy_uint8
            elif self.size == 2:
                return _numpy_uint16
            elif self.size == 4:
                return _numpy_uint32
            elif self.size == 8:
                return _numpy_uint64
        elif self.kind == b'f':
            if self.size == 2:
                return _numpy_float16
            elif self.size == 4:
                return _numpy_float32
            elif self.size == 8:
                return _numpy_float64
        elif self.kind == b'c':
            if self.size == 8:
                return _numpy_complex64
            elif self.size == 16:
                return _numpy_complex128
        assert False
cdef CScalar scalar_to_c_scalar(object x):
    # Converts a Python or NumPy scalar to a CScalar.
    # Returns None if the argument is not a scalar.
    cls = type(x)
    if cls in _python_scalar_type_set:
        return CScalar._from_python_scalar(x)
    if cls in _numpy_scalar_type_set:
        return CScalar._from_numpy_scalar(x)
    return None
cdef object scalar_to_numpy_scalar(object x):
    # Converts a Python or NumPy scalar to a NumPy scalar.
    # Returns None if the argument is not a scalar.
    # (The two type sets are disjoint, so the check order is irrelevant.)
    cls = type(x)
    if cls in _numpy_scalar_type_set:
        return x
    if cls in _python_scalar_type_set:
        return _python_scalar_to_numpy_scalar(x)
    return None
cpdef str _get_cuda_scalar_repr(obj, dtype):
    """Render scalar ``obj`` of ``dtype`` as a CUDA C source literal."""
    kind = dtype.kind
    itemsize = dtype.itemsize
    if kind == 'b':
        return str(bool(obj)).lower()
    if kind == 'i':
        # 64-bit integer literals need the 'll' suffix in CUDA C.
        return str(int(obj)) + ('' if itemsize < 8 else 'll')
    if kind == 'u':
        return str(int(obj)) + ('u' if itemsize < 8 else 'ull')
    if kind == 'f':
        if itemsize < 8:
            # Single precision: NaN/Inf have dedicated CUDA macros.
            if numpy.isnan(obj):
                return 'CUDART_NAN_F'
            if numpy.isinf(obj):
                return 'CUDART_INF_F' if obj > 0 else '-CUDART_INF_F'
            return str(float(obj)) + 'f'
        if numpy.isnan(obj):
            return 'CUDART_NAN'
        if numpy.isinf(obj):
            return 'CUDART_INF' if obj > 0 else '-CUDART_INF'
        return str(float(obj))
    if kind == 'c':
        if dtype.itemsize == 8:
            return f'thrust::complex<float>({obj.real}, {obj.imag})'
        elif dtype.itemsize == 16:
            return f'thrust::complex<double>({obj.real}, {obj.imag})'
    raise TypeError(f'Unsupported dtype: {dtype}')
from cupy._core._kernel import create_ufunc

# Elementwise copy with unsafe casting; one specialization per supported
# dtype.  Used as the backend of dtype-converting copies such as astype().
elementwise_copy = create_ufunc(
    'cupy_copy',
    ('?->?', 'b->b', 'B->B', 'h->h', 'H->H', 'i->i', 'I->I', 'l->l', 'L->L',
     'q->q', 'Q->Q', 'e->e', 'f->f', 'd->d', 'F->F', 'D->D'),
    'out0 = in0',
    default_casting='unsafe')
from libcpp cimport vector
from cupy.cuda cimport memory
from cupy.cuda.function cimport CPointer
from cupy.cuda.function cimport Module
from cupy._core._carray cimport shape_t
from cupy._core._carray cimport strides_t
cdef class _ndarray_base:
    cdef:
        object __weakref__
        readonly Py_ssize_t size
        public shape_t _shape
        public strides_t _strides
        readonly bint _c_contiguous
        readonly bint _f_contiguous
        # To do fast indexing in the CArray class
        readonly bint _index_32_bits
        readonly object dtype
        readonly memory.MemoryPointer data
        # TODO(niboshi): Return arbitrary owner object as `base` if the
        # underlying memory is UnownedMemory.
        readonly _ndarray_base base

    # `=*` marks an argument with a default value supplied in the .pyx file.
    cdef _init_fast(self, const shape_t& shape, dtype, bint c_order)
    cpdef item(self)
    cpdef tolist(self)
    cpdef bytes tobytes(self, order=*)
    cpdef tofile(self, fid, sep=*, format=*)
    cpdef dump(self, file)
    cpdef bytes dumps(self)
    cpdef _ndarray_base astype(
        self, dtype, order=*, casting=*, subok=*, copy=*)
    cpdef _ndarray_base copy(self, order=*)
    cpdef _ndarray_base view(self, dtype=*, array_class=*)
    cpdef fill(self, value)
    cpdef _ndarray_base swapaxes(self, Py_ssize_t axis1, Py_ssize_t axis2)
    cpdef _ndarray_base flatten(self, order=*)
    cpdef _ndarray_base ravel(self, order=*)
    cpdef _ndarray_base squeeze(self, axis=*)
    cpdef _ndarray_base take(self, indices, axis=*, out=*)
    cpdef put(self, indices, values, mode=*)
    cpdef repeat(self, repeats, axis=*)
    cpdef choose(self, choices, out=*, mode=*)
    cpdef sort(self, int axis=*)
    cpdef _ndarray_base argsort(self, axis=*)
    cpdef partition(self, kth, int axis=*)
    cpdef _ndarray_base argpartition(self, kth, axis=*)
    cpdef tuple nonzero(self)
    cpdef _ndarray_base compress(self, condition, axis=*, out=*)
    cpdef _ndarray_base diagonal(self, offset=*, axis1=*, axis2=*)
    cpdef _ndarray_base max(self, axis=*, out=*, keepdims=*)
    cpdef _ndarray_base argmax(self, axis=*, out=*, dtype=*, keepdims=*)
    cpdef _ndarray_base min(self, axis=*, out=*, keepdims=*)
    cpdef _ndarray_base argmin(self, axis=*, out=*, dtype=*, keepdims=*)
    cpdef _ndarray_base ptp(self, axis=*, out=*, keepdims=*)
    cpdef _ndarray_base clip(self, min=*, max=*, out=*)
    cpdef _ndarray_base round(self, decimals=*, out=*)
    cpdef _ndarray_base trace(self, offset=*, axis1=*, axis2=*, dtype=*, out=*)
    cpdef _ndarray_base sum(self, axis=*, dtype=*, out=*, keepdims=*)
    cpdef _ndarray_base cumsum(self, axis=*, dtype=*, out=*)
    cpdef _ndarray_base mean(self, axis=*, dtype=*, out=*, keepdims=*)
    cpdef _ndarray_base var(self, axis=*, dtype=*, out=*, ddof=*, keepdims=*)
    cpdef _ndarray_base std(self, axis=*, dtype=*, out=*, ddof=*, keepdims=*)
    cpdef _ndarray_base prod(self, axis=*, dtype=*, out=*, keepdims=*)
    cpdef _ndarray_base cumprod(self, axis=*, dtype=*, out=*)
    cpdef _ndarray_base _add_reduceat(self, indices, axis, dtype, out)
    cpdef _ndarray_base all(self, axis=*, out=*, keepdims=*)
    cpdef _ndarray_base any(self, axis=*, out=*, keepdims=*)
    cpdef _ndarray_base conj(self)
    cpdef _ndarray_base conjugate(self)
    cpdef get(self, stream=*, order=*, out=*)
    cpdef set(self, arr, stream=*)
    cpdef _ndarray_base reduced_view(self, dtype=*)
    cpdef _update_c_contiguity(self)
    cpdef _update_f_contiguity(self)
    cpdef _update_contiguity(self)
    cpdef _set_shape_and_strides(self, const shape_t& shape,
                                 const strides_t& strides,
                                 bint update_c_contiguity,
                                 bint update_f_contiguity)
    cdef _ndarray_base _view(self, subtype, const shape_t& shape,
                             const strides_t& strides,
                             bint update_c_contiguity,
                             bint update_f_contiguity, obj)
    cpdef _set_contiguous_strides(
        self, Py_ssize_t itemsize, bint is_c_contiguous)
    cdef CPointer get_pointer(self)
    cpdef object toDlpack(self)
cpdef _ndarray_base _internal_ascontiguousarray(_ndarray_base a)
cpdef _ndarray_base _internal_asfortranarray(_ndarray_base a)
cpdef _ndarray_base ascontiguousarray(_ndarray_base a, dtype=*)
cpdef _ndarray_base asfortranarray(_ndarray_base a, dtype=*)

# NOTE(review): `cachd_dir` looks like a typo for `cache_dir`, but it is the
# declared keyword name; renaming would have to be coordinated with the
# implementation and all callers.
cpdef Module compile_with_cache(str source, tuple options=*, arch=*,
                                cachd_dir=*, prepend_cupy_headers=*,
                                backend=*, translate_cucomplex=*,
                                enable_cooperative_groups=*,
                                name_expressions=*, log_stream=*,
                                bint jitify=*)

# TODO(niboshi): Move to _routines_creation.pyx
cpdef _ndarray_base array(
    obj, dtype=*, bint copy=*, order=*, bint subok=*, Py_ssize_t ndmin=*)
cpdef _ndarray_base _convert_object_with_cuda_array_interface(a)

cdef _ndarray_base _ndarray_init(subtype, const shape_t& shape, dtype, obj)

cdef _ndarray_base _create_ndarray_from_shape_strides(
    subtype, const shape_t& shape, const strides_t& strides, dtype, obj)
# distutils: language = c++
import contextlib
import functools
import os
import pickle
import re
import warnings
import numpy
import cupy
from cupy._core._kernel import create_ufunc
from cupy._core._kernel import ElementwiseKernel
from cupy._core._ufuncs import elementwise_copy
from cupy._core import flags
from cupy._core import syncdetect
from cupy import cuda
from cupy.cuda import memory as memory_module
from cupy.cuda import stream as stream_mod
from cupy_backends.cuda.api.runtime import CUDARuntimeError
from cupy import _util
cimport cython # NOQA
from libc.stdint cimport int64_t, intptr_t
from cupy._core cimport _carray
from cupy._core cimport _dtype
from cupy._core._dtype cimport get_dtype
from cupy._core._kernel cimport create_ufunc
from cupy._core cimport _routines_binary as _binary
from cupy._core cimport _routines_indexing as _indexing
from cupy._core cimport _routines_linalg as _linalg
from cupy._core cimport _routines_logic as _logic
from cupy._core cimport _routines_manipulation as _manipulation
from cupy._core cimport _routines_math as _math
from cupy._core cimport _routines_sorting as _sorting
from cupy._core cimport _routines_statistics as _statistics
from cupy._core cimport _scalar
from cupy._core cimport dlpack
from cupy._core cimport internal
from cupy.cuda cimport device
from cupy.cuda cimport function
from cupy.cuda cimport pinned_memory
from cupy.cuda cimport memory
from cupy.cuda cimport stream as stream_module
from cupy_backends.cuda cimport stream as _stream_module
from cupy_backends.cuda.api cimport runtime
from cupy_backends.cuda.libs cimport cublas
# If rop of cupy.ndarray is called, cupy's op is the last chance.
# If op of cupy.ndarray is called and the `other` is cupy.ndarray, too,
# it is safe to call cupy's op.
# Otherwise, use this function `_should_use_rop` to choose
# * [True] return NotImplemented to defer rhs, or
# * [False] call NumPy's ufunc to try all `__array_ufunc__`.
# Note that extension types (`cdef class`) in Cython 0.x shares
# implementations of op and rop. (i.e. `__radd__(self, other)` is
# `__add__(other, self)`.)
#
# It follows NEP 13 except that cupy also implements the fallback to
# `__array_priority__`, which seems fair and necessary because of the
# following facts:
# * `numpy` : `scipy.sparse` = `cupy` : `cupyx.scipy.sparse`;
# * NumPy ignores `__array_priority__` attributes of arguments if NumPy finds
# `__array_function__` of `cupy.ndarray`;
# * SciPy sparse classes don't implement `__array_function__` and they even
# don't set `__array_function__ = None` to opt-out the feature; and
# * `__array_priority__` of SciPy sparse classes is respected because
# `numpy.ndarray.__array_function__` does not disable `__array_priority__`.
@cython.profile(False)
cdef inline _should_use_rop(x, y):
    # Decide whether x's op should yield to y's reflected op (see the
    # NEP 13 discussion in the comment block above this function).
    try:
        y_ufunc = y.__array_ufunc__
    except AttributeError:
        pass
    else:
        # Defer only when y explicitly opted out with __array_ufunc__ = None.
        return y_ufunc is None
    # NEP 13's recommendation is `return False`; CuPy additionally falls
    # back to comparing __array_priority__ for scipy.sparse-like classes.
    xp = getattr(x, '__array_priority__', 0)
    yp = getattr(y, '__array_priority__', 0)
    return xp < yp
# Types handled by ndarray binary ops; declared here, presumably assigned
# later in this module (assignment not visible in this chunk).
cdef tuple _HANDLED_TYPES
# Reusable no-op context manager (used when no device switch is needed).
cdef object _null_context = contextlib.nullcontext()
class ndarray(_ndarray_base):
    """
    __init__(self, shape, dtype=float, memptr=None, strides=None, order='C')

    Multi-dimensional array on a CUDA device.

    This class implements a subset of methods of :class:`numpy.ndarray`.
    The difference is that this class allocates the array content on the
    current GPU device.

    Args:
        shape (tuple of ints): Length of axes.
        dtype: Data type. It must be an argument of :class:`numpy.dtype`.
        memptr (cupy.cuda.MemoryPointer): Pointer to the array content head.
        strides (tuple of ints or None): Strides of data in memory.
        order ({'C', 'F'}): Row-major (C-style) or column-major
            (Fortran-style) order.

    Attributes:
        base (None or cupy.ndarray): Base array from which this array is
            created as a view.
        data (cupy.cuda.MemoryPointer): Pointer to the array content head.
        ~ndarray.dtype(numpy.dtype): Dtype object of element type.

            .. seealso::
               `Data type objects (dtype) \
               <https://numpy.org/doc/stable/reference/arrays.dtypes.html>`_
        ~ndarray.size (int): Number of elements this array holds.

            This is equivalent to product over the shape tuple.

            .. seealso:: :attr:`numpy.ndarray.size`
    """

    __module__ = 'cupy'

    def __new__(cls, *args, _obj=None, _no_init=False, **kwargs):
        # `_no_init` skips initialization for internal view/wrap creation.
        x = super().__new__(cls, *args, **kwargs)
        if _no_init:
            return x
        x._init(*args, **kwargs)
        if cls is not ndarray:
            # Mirror NumPy's subclassing hook.
            x.__array_finalize__(_obj)
        return x

    def __init__(self, *args, **kwargs):
        # Prevent from calling the super class `_ndarray_base.__init__()` as
        # it is used to check accidental direct instantiation of underlying
        # `_ndarray_base` extension.
        pass

    def __array_finalize__(self, obj):
        pass

    # We provide the Python-level wrapper of `view` method to follow NumPy's
    # API signature, as it seems that Cython's `cpdef`d methods does not take
    # an argument named `type`. Cython also does not take starargs
    # (`*args` and `**kwargs`) for `cpdef`d methods so we can not interpret
    # the arguments `dtype` and `type` from them.
    def view(self, dtype=None, type=None):
        """Returns a view of the array.

        Args:
            dtype: If this is different from the data type of the array, the
                returned view reinterpret the memory sequence as an array of
                this type.

        Returns:
            cupy.ndarray: A view of the array. A reference to the original
            array is stored at the :attr:`~ndarray.base` attribute.

        .. seealso:: :meth:`numpy.ndarray.view`
        """
        return super(ndarray, self).view(dtype=dtype, array_class=type)
cdef class _ndarray_base:

    def __init__(self, *args, **kwargs):
        # Raise an error if the underlying `_ndarray_base` extension type is
        # directly instantiated. We must instantiate the `ndarray` class
        # instead for our ndarray subclassing mechanism.
        raise RuntimeError('Must not be directly instantiated')
    def _init(self, shape, dtype=float, memptr=None, strides=None,
              order='C'):
        """Set up shape/strides/dtype and allocate (or adopt) device memory.

        Called from ``ndarray.__new__``; not part of the public API.
        """
        cdef Py_ssize_t x, itemsize
        cdef tuple s = internal.get_size(shape)
        del shape

        cdef int order_char = (
            b'C' if order is None else internal._normalize_order(order))

        # `strides` is prioritized over `order`, but invalid `order` should
        # be checked even if `strides` is given.
        if order_char != b'C' and order_char != b'F':
            raise ValueError('order not understood. order=%s' % order)

        # Check for erroneous shape
        if len(s) > _carray.MAX_NDIM:
            msg = 'maximum supported dimension for an ndarray is '
            msg += f'{_carray.MAX_NDIM}, found {len(s)}'
            raise ValueError(msg)
        self._shape.reserve(len(s))
        for x in s:
            if x < 0:
                raise ValueError('Negative dimensions are not allowed')
            self._shape.push_back(x)
        del s

        # dtype
        self.dtype, itemsize = _dtype.get_dtype_with_itemsize(dtype)

        # Store shape and strides
        if strides is not None:
            if memptr is None:
                raise ValueError('memptr is required if strides is given.')
            self._set_shape_and_strides(self._shape, strides, True, True)
        elif order_char == b'C':
            self._set_contiguous_strides(itemsize, True)
        elif order_char == b'F':
            self._set_contiguous_strides(itemsize, False)
        else:
            assert False

        # data
        if memptr is None:
            self.data = memory.alloc(self.size * itemsize)
            # 32-bit indexing is a kernel fast path for small arrays.
            self._index_32_bits = (self.size * itemsize) <= (1 << 31)
        else:
            self.data = memptr
            # For adopted memory, bound the index width by the full extent
            # of the underlying allocation, not just this view.
            bound = cupy._core._memory_range.get_bound(self)
            self._index_32_bits = bound[1] - bound[0] <= (1 << 31)
    cdef _init_fast(self, const shape_t& shape, dtype, bint c_order):
        """ For internal ndarray creation. """
        # Skips the per-element validation done in _init(); callers pass a
        # pre-validated shape_t.
        cdef Py_ssize_t itemsize
        if shape.size() > _carray.MAX_NDIM:
            msg = 'maximum supported dimension for an ndarray is '
            msg += f'{_carray.MAX_NDIM}, found {shape.size()}'
            raise ValueError(msg)
        self._shape = shape
        self.dtype, itemsize = _dtype.get_dtype_with_itemsize(dtype)
        self._set_contiguous_strides(itemsize, c_order)
        self.data = memory.alloc(self.size * itemsize)
        self._index_32_bits = (self.size * itemsize) <= (1 << 31)
    @property
    def __cuda_array_interface__(self):
        """CUDA Array Interface descriptor of this array.

        The exported CAI version is selected by
        ``CUPY_CUDA_ARRAY_INTERFACE_EXPORT_VERSION`` (3, the default, or 2).
        """
        if runtime._is_hip_environment:
            raise AttributeError(
                'HIP/ROCm does not support cuda array interface')
        cdef dict desc = {
            'shape': self.shape,
            'typestr': self.dtype.str,
            'descr': self.dtype.descr,
        }
        cdef int ver = _util.CUDA_ARRAY_INTERFACE_EXPORT_VERSION
        cdef intptr_t stream_ptr
        if ver == 3:
            stream_ptr = stream_module.get_current_stream_ptr()
            # CAI v3 says setting the stream field to 0 is disallowed
            if stream_ptr == 0:
                stream_ptr = _stream_module.get_default_stream_ptr()
            desc['stream'] = stream_ptr
        elif ver == 2:
            # Old behavior (prior to CAI v3): stream sync is explicitly
            # handled by users. To restore this behavior, we do not export
            # any stream if CUPY_CUDA_ARRAY_INTERFACE_EXPORT_VERSION is set
            # to 2 (so that other participating libraries lacking a finer
            # control over sync behavior can avoid syncing).
            pass
        else:
            raise ValueError('CUPY_CUDA_ARRAY_INTERFACE_EXPORT_VERSION can '
                             'only be set to 3 (default) or 2')
        desc['version'] = ver
        if self._c_contiguous:
            # The spec allows strides=None for C-contiguous exports.
            desc['strides'] = None
        else:
            desc['strides'] = self.strides
        if self.size > 0:
            desc['data'] = (self.data.ptr, False)
        else:
            desc['data'] = (0, False)
        return desc
    def __dlpack__(self, stream=None):
        """Export the array as a DLPack capsule, ordering against ``stream``.

        Note: the stream argument is supplied by the consumer, not by CuPy.
        """
        curr_stream = stream_module.get_current_stream()
        curr_stream_ptr = curr_stream.ptr

        # stream must be an int for CUDA/ROCm
        if not runtime._is_hip_environment:  # CUDA
            if stream is None:
                stream = runtime.streamLegacy
            elif not isinstance(stream, int) or stream < -1:
                # DLPack does not accept 0 as a valid stream, but there is a
                # bug in PyTorch that exports the default stream as 0, which
                # renders the protocol unusable, we will accept a 0 value
                # meanwhile.
                raise ValueError(
                    f'On CUDA, the valid stream for the DLPack protocol is '
                    f'-1, 1, 2, or any larger value, but {stream} was '
                    f'provided')
            if stream == 0:
                warnings.warn(
                    'Stream 0 is passed from a library that you are'
                    ' converting to; CuPy assumes 0 as a legacy default '
                    'stream. Please report this problem to the library as '
                    'this violates the DLPack protocol.')
                stream = runtime.streamLegacy
            if curr_stream_ptr == 0:
                curr_stream_ptr = runtime.streamLegacy
        else:  # ROCm/HIP
            if stream is None:
                stream = 0
            elif (not isinstance(stream, int) or stream < -1
                    or stream in (1, 2)):
                raise ValueError(
                    f'On ROCm/HIP, the valid stream for the DLPack protocol '
                    f'is -1, 0, or any value > 2, but {stream} was provided')

        # if -1, no stream order should be established; otherwise, the
        # consumer stream should wait for the work on CuPy's current stream
        # to finish
        if stream >= 0 and stream != curr_stream_ptr:
            next_stream = stream_mod.ExternalStream(stream)
            event = curr_stream.record()
            next_stream.wait_event(event)
        return dlpack.toDlpack(self)
    def __dlpack_device__(self):
        """Return the DLPack ``(device_type, device_id)`` pair."""
        if not runtime._is_hip_environment:
            # Managed (unified) memory gets its own DLPack device type, but
            # only if the targeted DLPack version understands it.
            attrs = runtime.pointerGetAttributes(self.data.ptr)
            is_managed = (
                attrs.type == runtime.memoryTypeManaged
                and _util.DLPACK_EXPORT_VERSION >= (0, 6))
            if is_managed:
                device_type = dlpack.managed_CUDA
            else:
                device_type = dlpack.device_CUDA
        else:
            device_type = dlpack.device_ROCM
        return (device_type, self.device.id)
    # The definition order of attributes and methods are borrowed from the
    # order of documentation at the following NumPy document.
    # https://numpy.org/doc/stable/reference/arrays.ndarray.html

    # -------------------------------------------------------------------------
    # Memory layout
    # -------------------------------------------------------------------------
    @property
    def flags(self):
        """Object containing memory-layout information.

        It only contains ``c_contiguous``, ``f_contiguous``, and ``owndata``
        attributes. All of these are read-only. Accessing by indexes is also
        supported.

        .. seealso:: :attr:`numpy.ndarray.flags`
        """
        # `owndata` is derived from `base`: a view never owns its memory.
        return flags.Flags(self._c_contiguous, self._f_contiguous,
                           self.base is None)
    property shape:
        """Lengths of axes.

        Setter of this property involves reshaping without copy. If the
        array cannot be reshaped without copy, it raises an exception.

        .. seealso: :attr:`numpy.ndarray.shape`
        """

        def __get__(self):
            # `_shape` is a C++ vector; expose it as a Python tuple.
            return tuple(self._shape)

        def __set__(self, newshape):
            _manipulation._ndarray_shape_setter(self, newshape)

    @property
    def strides(self):
        """Strides of axes in bytes.

        .. seealso:: :attr:`numpy.ndarray.strides`
        """
        return tuple(self._strides)

    @property
    def ndim(self):
        """Number of dimensions.

        ``a.ndim`` is equivalent to ``len(a.shape)``.

        .. seealso:: :attr:`numpy.ndarray.ndim`
        """
        return self._shape.size()

    @property
    def itemsize(self):
        """Size of each element in bytes.

        .. seealso:: :attr:`numpy.ndarray.itemsize`
        """
        return self.dtype.itemsize

    @property
    def nbytes(self):
        """Total size of all elements in bytes.

        It does not count skips between elements.

        .. seealso:: :attr:`numpy.ndarray.nbytes`
        """
        return self.size * self.dtype.itemsize
    # -------------------------------------------------------------------------
    # Other attributes
    # -------------------------------------------------------------------------
    @property
    def T(self):
        """Shape-reversed view of the array.

        If ndim < 2, then this is just a reference to the array itself.
        """
        if self.ndim < 2:
            return self
        else:
            return _manipulation._T(self)

    @property
    def flat(self):
        # 1-D iterator over the array (see cupy.flatiter).
        return cupy.flatiter(self)

    # Kept above numpy.ndarray's priority so mixed NumPy/CuPy expressions
    # dispatch to CuPy (see the _should_use_rop discussion above).
    __array_priority__ = 100

    # -------------------------------------------------------------------------
    # Array interface
    # -------------------------------------------------------------------------
    # TODO(beam2d): Implement __array_interface__

    # -------------------------------------------------------------------------
    # foreign function interface
    # -------------------------------------------------------------------------
    @property
    def cstruct(self):
        """C representation of the array.

        This property is used for sending an array to CUDA kernels. The
        type of returned C structure is different for different dtypes and
        ndims. The definition of C type is written in ``cupy/carray.cuh``.
        """
        return _CArray_from_ndarray(self)
    # -------------------------------------------------------------------------
    # Array conversion
    # -------------------------------------------------------------------------
    # These methods transfer data to the host via self.get() and delegate to
    # the NumPy implementation, so each call synchronizes with the device.
    cpdef item(self):
        """Converts the array with one element to a Python scalar

        Returns:
            int or float or complex: The element of the array.

        .. seealso:: :meth:`numpy.ndarray.item`
        """
        if self.size != 1:
            raise ValueError(
                'can only convert an array of size 1 to a Python scalar')
        return self.get().item()

    cpdef tolist(self):
        """Converts the array to a (possibly nested) Python list.

        Returns:
            list: The possibly nested Python list of array elements.

        .. seealso:: :meth:`numpy.ndarray.tolist`
        """
        return self.get().tolist()

    # TODO(okuta): Implement itemset
    # TODO(okuta): Implement tostring

    cpdef bytes tobytes(self, order='C'):
        """Turns the array into a Python bytes object."""
        return self.get().tobytes(order)

    cpdef tofile(self, fid, sep='', format='%s'):
        """Writes the array to a file.

        .. seealso:: :meth:`numpy.ndarray.tofile`
        """
        self.get().tofile(fid, sep, format)

    cpdef dump(self, file):
        """Dumps a pickle of the array to a file.

        Dumped file can be read back to :class:`cupy.ndarray` by
        :func:`cupy.load`.
        """
        # -1 selects the highest available pickle protocol.
        pickle.dump(self, file, -1)

    cpdef bytes dumps(self):
        """Dumps a pickle of the array to a string."""
        return pickle.dumps(self, -1)
    cpdef _ndarray_base astype(
            self, dtype, order='K', casting=None, subok=None, copy=True):
        """Casts the array to given data type.

        Args:
            dtype: Type specifier.
            order ({'C', 'F', 'A', 'K'}): Row-major (C-style) or column-major
                (Fortran-style) order.
                When ``order`` is 'A', it uses 'F' if ``a`` is column-major
                and uses 'C' otherwise.
                And when ``order`` is 'K', it keeps strides as closely as
                possible.
            copy (bool): If it is False and no cast happens, then this method
                returns the array itself. Otherwise, a copy is returned.

        Returns:
            If ``copy`` is False and no cast is required, then the array
            itself is returned. Otherwise, it returns a (possibly casted)
            copy of the array.

        .. note::
           This method currently does not support ``casting``, and ``subok``
           arguments.

        .. seealso:: :meth:`numpy.ndarray.astype`
        """
        cdef strides_t strides

        # TODO(beam2d): Support casting and subok option
        if casting is not None:
            raise TypeError('casting is not supported yet')
        if subok is not None:
            raise TypeError('subok is not supported yet')

        if order is None:
            order = 'K'
        cdef int order_char = internal._normalize_order(order)

        dtype = get_dtype(dtype)
        if dtype == self.dtype:
            # No cast needed: honor copy=False when the requested layout is
            # already satisfied.
            if not copy and (
                    order_char == b'K' or
                    order_char == b'A' and (self._c_contiguous or
                                            self._f_contiguous) or
                    order_char == b'C' and self._c_contiguous or
                    order_char == b'F' and self._f_contiguous):
                return self

        order_char = internal._update_order_char(
            self._c_contiguous, self._f_contiguous, order_char)

        if order_char == b'K':
            # 'K' keeps the relative stride order of the source.
            strides = internal._get_strides_for_order_K(self, dtype)
            newarray = _ndarray_init(ndarray, self._shape, dtype, None)
            # TODO(niboshi): Confirm update_x_contiguity flags
            newarray._set_shape_and_strides(self._shape, strides, True, True)
        else:
            newarray = ndarray(self.shape, dtype=dtype, order=chr(order_char))

        if self.size == 0:
            # skip copy
            if self.dtype.kind == 'c' and newarray.dtype.kind not in 'bc':
                warnings.warn(
                    'Casting complex values to real discards the imaginary '
                    'part',
                    numpy.ComplexWarning)
        else:
            elementwise_copy(self, newarray)
        return newarray

    # TODO(okuta): Implement byteswap
# TODO(okuta): Implement byteswap
cpdef _ndarray_base copy(self, order='C'):
"""Returns a copy of the array.
This method makes a copy of a given array in the current device.
Even when a given array is located in another device, you can copy it
to the current device.
Args:
order ({'C', 'F', 'A', 'K'}): Row-major (C-style) or column-major
(Fortran-style) order.
When ``order`` is 'A', it uses 'F' if ``a`` is column-major and
uses 'C' otherwise.
And when `order` is 'K', it keeps strides as closely as
possible.
.. seealso::
:func:`cupy.copy` for full documentation,
:meth:`numpy.ndarray.copy`
"""
cdef _ndarray_base x
if self.size == 0:
return self.astype(self.dtype, order=order)
dev_id = device.get_device_id()
if self.data.device_id == dev_id:
return self.astype(self.dtype, order=order)
# It need to make a contiguous copy for copying from another device
prev_device = runtime.getDevice()
try:
runtime.setDevice(self.device.id)
x = self.astype(self.dtype, order=order, copy=False)
finally:
runtime.setDevice(prev_device)
newarray = _ndarray_init(ndarray, x._shape, x.dtype, None)
if not x._c_contiguous and not x._f_contiguous:
raise NotImplementedError(
'CuPy cannot copy non-contiguous array between devices.')
# TODO(niboshi): Confirm update_x_contiguity flags
newarray._strides = x._strides
newarray._c_contiguous = x._c_contiguous
newarray._f_contiguous = x._f_contiguous
copy_context = _null_context
if runtime._is_hip_environment:
# HIP requires changing the active device to the one where
# src data is before the copy. From the docs:
# it is recommended to set the current device to the device
# where the src data is physically located.
copy_context = self.device
with copy_context:
newarray.data.copy_from_device_async(x.data, x.nbytes)
return newarray
    cpdef _ndarray_base view(self, dtype=None, array_class=None):
        """Return a view, optionally reinterpreting memory as ``dtype``
        and/or wrapping it in ``array_class`` (an ndarray subclass)."""
        cdef Py_ssize_t ndim, axis, tmp_size
        cdef int self_is, v_is
        if dtype is not None:
            # Mirror NumPy: view(SomeNdarraySubclass) selects the class.
            if type(dtype) is type and issubclass(dtype, ndarray):
                if array_class is not None:
                    raise ValueError('Cannot specify output type twice.')
                array_class = dtype
                dtype = None
        if (
            array_class is not None and (
                type(array_class) is not type or
                not issubclass(array_class, ndarray)
            )
        ):
            raise ValueError('Type must be a sub-type of ndarray type')
        if array_class is None:
            array_class = type(self)

        v = self._view(
            array_class, self._shape, self._strides, False, False, self)
        if dtype is None:
            return v

        v.dtype, v_is = _dtype.get_dtype_with_itemsize(dtype)
        self_is = self.dtype.itemsize
        if v_is == self_is:
            return v

        # Itemsize changed: the last axis is re-scaled in place below.
        ndim = self._shape.size()
        if ndim == 0:
            raise ValueError(
                'Changing the dtype of a 0d array is only supported if '
                'the itemsize is unchanged')
        axis = ndim - 1
        if (
            self._shape[axis] != 1
            and self.size != 0
            and self._strides[axis] != self.dtype.itemsize
        ):
            raise ValueError(
                'To change to a dtype of a different size, the last axis '
                'must be contiguous')

        # Normalize `_strides[axis]` whenever itemsize changes
        v._strides[axis] = v_is
        tmp_size = v._shape[axis] * self_is
        if tmp_size % v_is != 0:
            raise ValueError(
                'When changing to a larger dtype, its size must be a '
                'divisor of the total size in bytes of the last axis '
                'of the array.')
        # itemsize of dtype in CuPy is one of 1, 2, 4, 8, 16.
        # Thus, CuPy does not raise the following:
        #     raise ValueError(
        #         'When changing to a smaller dtype, its size must be a '
        #         'divisor of the size of original dtype')
        v._shape[axis] = tmp_size // v_is
        v.size = v.size * self_is // v_is  # divisible because shape[axis] is.
        if axis != ndim - 1:
            v._update_c_contiguity()
        if axis != 0:
            v._update_f_contiguity()
        return v

    # TODO(okuta): Implement getfield
    # TODO(okuta): Implement setflags
# TODO(okuta): Implement getfield
# TODO(okuta): Implement setflags
cpdef fill(self, value):
"""Fills the array with a scalar value.
Args:
value: A scalar value to fill the array content.
.. seealso:: :meth:`numpy.ndarray.fill`
"""
if isinstance(value, cupy.ndarray):
if value.shape != ():
raise ValueError(
'non-scalar cupy.ndarray cannot be used for fill')
value = value.astype(self.dtype, copy=False)
fill_kernel(value, self)
return
if isinstance(value, numpy.ndarray):
if value.shape != ():
raise ValueError(
'non-scalar numpy.ndarray cannot be used for fill')
value = value.astype(self.dtype, copy=False).item()
if value == 0 and self._c_contiguous:
self.data.memset_async(0, self.nbytes)
else:
fill_kernel(value, self)
# -------------------------------------------------------------------------
# Shape manipulation
# -------------------------------------------------------------------------
def reshape(self, *shape, order='C'):
    """Returns an array of a different shape and the same content.

    .. seealso::
        :func:`cupy.reshape` for full documentation,
        :meth:`numpy.ndarray.reshape`
    """
    return _manipulation._ndarray_reshape(self, shape, order)

# TODO(okuta): Implement resize

def transpose(self, *axes):
    """Returns a view of the array with axes permuted.

    .. seealso::
        :func:`cupy.transpose` for full documentation,
        :meth:`numpy.ndarray.transpose`
    """
    return _manipulation._ndarray_transpose(self, axes)

cpdef _ndarray_base swapaxes(self, Py_ssize_t axis1, Py_ssize_t axis2):
    """Returns a view of the array with two axes swapped.

    .. seealso::
        :func:`cupy.swapaxes` for full documentation,
        :meth:`numpy.ndarray.swapaxes`
    """
    return _manipulation._ndarray_swapaxes(self, axis1, axis2)

cpdef _ndarray_base flatten(self, order='C'):
    """Returns a copy of the array flatten into one dimension.

    Args:
        order ({'C', 'F', 'A', 'K'}):
            'C' means to flatten in row-major (C-style) order.
            'F' means to flatten in column-major (Fortran-
            style) order. 'A' means to flatten in column-major
            order if `self` is Fortran *contiguous* in memory,
            row-major order otherwise. 'K' means to flatten
            `self` in the order the elements occur in memory.
            The default is 'C'.

    Returns:
        cupy.ndarray: A copy of the array with one dimension.

    .. seealso:: :meth:`numpy.ndarray.flatten`
    """
    return _manipulation._ndarray_flatten(self, order)

cpdef _ndarray_base ravel(self, order='C'):
    """Returns an array flattened into one dimension.

    .. seealso::
        :func:`cupy.ravel` for full documentation,
        :meth:`numpy.ndarray.ravel`
    """
    # The result is forced to be C-contiguous before returning.
    return _internal_ascontiguousarray(
        _manipulation._ndarray_ravel(self, order))

cpdef _ndarray_base squeeze(self, axis=None):
    """Returns a view with size-one axes removed.

    .. seealso::
        :func:`cupy.squeeze` for full documentation,
        :meth:`numpy.ndarray.squeeze`
    """
    return _manipulation._ndarray_squeeze(self, axis)
# -------------------------------------------------------------------------
# Item selection and manipulation
# -------------------------------------------------------------------------
cpdef _ndarray_base take(self, indices, axis=None, out=None):
    """Returns an array of elements at given indices along the axis.

    .. seealso::
        :func:`cupy.take` for full documentation,
        :meth:`numpy.ndarray.take`
    """
    return _indexing._ndarray_take(self, indices, axis, out)

cpdef put(self, indices, values, mode='wrap'):
    """Replaces specified elements of an array with given values.

    .. seealso::
        :func:`cupy.put` for full documentation,
        :meth:`numpy.ndarray.put`
    """
    return _indexing._ndarray_put(self, indices, values, mode)

cpdef repeat(self, repeats, axis=None):
    """Returns an array with repeated arrays along an axis.

    .. seealso::
        :func:`cupy.repeat` for full documentation,
        :meth:`numpy.ndarray.repeat`
    """
    return _manipulation._ndarray_repeat(self, repeats, axis)

cpdef choose(self, choices, out=None, mode='raise'):
    """Constructs an array from an index array and a set of choice arrays.

    .. seealso::
        :func:`cupy.choose` for full documentation,
        :meth:`numpy.ndarray.choose`
    """
    return _indexing._ndarray_choose(self, choices, out, mode)

cpdef sort(self, int axis=-1):
    """Sort an array, in-place with a stable sorting algorithm.

    Args:
        axis (int): Axis along which to sort. Default is -1, which means
            sort along the last axis.

    .. note::
        For its implementation reason, ``ndarray.sort`` currently supports
        only arrays with their own data, and does not support ``kind`` and
        ``order`` parameters that ``numpy.ndarray.sort`` does support.

    .. seealso::
        :func:`cupy.sort` for full documentation,
        :meth:`numpy.ndarray.sort`
    """
    # TODO(takagi): Support kind argument.
    _sorting._ndarray_sort(self, axis)

cpdef _ndarray_base argsort(self, axis=-1):
    """Returns the indices that would sort an array with stable sorting

    Args:
        axis (int or None): Axis along which to sort. Default is -1, which
            means sort along the last axis. If None is supplied, the array
            is flattened before sorting.

    Returns:
        cupy.ndarray: Array of indices that sort the array.

    .. seealso::
        :func:`cupy.argsort` for full documentation,
        :meth:`numpy.ndarray.argsort`
    """
    # TODO(takagi): Support kind argument.
    return _sorting._ndarray_argsort(self, axis)

cpdef partition(self, kth, int axis=-1):
    """Partitions an array.

    Args:
        kth (int or sequence of ints): Element index to partition by. If
            supplied with a sequence of k-th it will partition all elements
            indexed by k-th of them into their sorted position at once.
        axis (int): Axis along which to sort. Default is -1, which means
            sort along the last axis.

    .. seealso::
        :func:`cupy.partition` for full documentation,
        :meth:`numpy.ndarray.partition`
    """
    _sorting._ndarray_partition(self, kth, axis)

cpdef _ndarray_base argpartition(self, kth, axis=-1):
    """Returns the indices that would partially sort an array.

    Args:
        kth (int or sequence of ints): Element index to partition by. If
            supplied with a sequence of k-th it will partition all elements
            indexed by k-th of them into their sorted position at once.
        axis (int or None): Axis along which to sort. Default is -1, which
            means sort along the last axis. If None is supplied, the array
            is flattened before sorting.

    Returns:
        cupy.ndarray: Array of the same type and shape as ``a``.

    .. seealso::
        :func:`cupy.argpartition` for full documentation,
        :meth:`numpy.ndarray.argpartition`
    """
    return _sorting._ndarray_argpartition(self, kth, axis)

def searchsorted(self, v, side='left', sorter=None):
    """Finds indices where elements of v should be inserted to maintain order.

    For full documentation, see :func:`cupy.searchsorted`

    .. seealso:: :func:`numpy.searchsorted`
    """ # NOQA
    return cupy.searchsorted(self, v, side, sorter)

cpdef tuple nonzero(self):
    """Return the indices of the elements that are non-zero.

    Returned Array is containing the indices of the non-zero elements
    in that dimension.

    Returns:
        tuple of arrays: Indices of elements that are non-zero.

    .. warning::
        This function may synchronize the device.

    .. seealso::
        :func:`numpy.nonzero`
    """
    return _indexing._ndarray_nonzero(self)

cpdef _ndarray_base compress(self, condition, axis=None, out=None):
    """Returns selected slices of this array along given axis.

    .. warning::
        This function may synchronize the device.

    .. seealso::
        :func:`cupy.compress` for full documentation,
        :meth:`numpy.ndarray.compress`
    """
    return _indexing._ndarray_compress(self, condition, axis, out)

cpdef _ndarray_base diagonal(self, offset=0, axis1=0, axis2=1):
    """Returns a view of the specified diagonals.

    .. seealso::
        :func:`cupy.diagonal` for full documentation,
        :meth:`numpy.ndarray.diagonal`
    """
    return _indexing._ndarray_diagonal(self, offset, axis1, axis2)
# -------------------------------------------------------------------------
# Calculation
# -------------------------------------------------------------------------
cpdef _ndarray_base max(self, axis=None, out=None, keepdims=False):
    """Returns the maximum along a given axis.

    .. seealso::
        :func:`cupy.amax` for full documentation,
        :meth:`numpy.ndarray.max`
    """
    return _statistics._ndarray_max(self, axis, out, None, keepdims)

cpdef _ndarray_base argmax(
        self, axis=None, out=None, dtype=None, keepdims=False):
    """Returns the indices of the maximum along a given axis.

    .. note::
        ``dtype`` and ``keepdim`` arguments are specific to CuPy. They are
        not in NumPy.

    .. note::
        ``axis`` argument accepts a tuple of ints, but this is specific to
        CuPy. NumPy does not support it.

    .. seealso::
        :func:`cupy.argmax` for full documentation,
        :meth:`numpy.ndarray.argmax`
    """
    return _statistics._ndarray_argmax(self, axis, out, dtype, keepdims)

cpdef _ndarray_base min(self, axis=None, out=None, keepdims=False):
    """Returns the minimum along a given axis.

    .. seealso::
        :func:`cupy.amin` for full documentation,
        :meth:`numpy.ndarray.min`
    """
    return _statistics._ndarray_min(self, axis, out, None, keepdims)

cpdef _ndarray_base argmin(
        self, axis=None, out=None, dtype=None, keepdims=False):
    """Returns the indices of the minimum along a given axis.

    .. note::
        ``dtype`` and ``keepdim`` arguments are specific to CuPy. They are
        not in NumPy.

    .. note::
        ``axis`` argument accepts a tuple of ints, but this is specific to
        CuPy. NumPy does not support it.

    .. seealso::
        :func:`cupy.argmin` for full documentation,
        :meth:`numpy.ndarray.argmin`
    """
    return _statistics._ndarray_argmin(self, axis, out, dtype, keepdims)

cpdef _ndarray_base ptp(self, axis=None, out=None, keepdims=False):
    """Returns (maximum - minimum) along a given axis.

    .. seealso::
        :func:`cupy.ptp` for full documentation,
        :meth:`numpy.ndarray.ptp`
    """
    return _statistics._ndarray_ptp(self, axis, out, keepdims)

cpdef _ndarray_base clip(self, min=None, max=None, out=None):
    """Returns an array with values limited to [min, max].

    .. seealso::
        :func:`cupy.clip` for full documentation,
        :meth:`numpy.ndarray.clip`
    """
    return _math._ndarray_clip(self, min, max, out)

cpdef _ndarray_base round(self, decimals=0, out=None):
    """Returns an array with values rounded to the given number of decimals.

    .. seealso::
        :func:`cupy.around` for full documentation,
        :meth:`numpy.ndarray.round`
    """ # NOQA
    return _round_ufunc(self, decimals, out=out)

cpdef _ndarray_base trace(
        self, offset=0, axis1=0, axis2=1, dtype=None, out=None):
    """Returns the sum along diagonals of the array.

    .. seealso::
        :func:`cupy.trace` for full documentation,
        :meth:`numpy.ndarray.trace`
    """
    # Implemented as diagonal extraction followed by a sum over the last
    # axis of the diagonal view.
    d = self.diagonal(offset, axis1, axis2)
    return d.sum(-1, dtype, out, False)

cpdef _ndarray_base sum(
        self, axis=None, dtype=None, out=None, keepdims=False):
    """Returns the sum along a given axis.

    .. seealso::
        :func:`cupy.sum` for full documentation,
        :meth:`numpy.ndarray.sum`
    """
    return _math._ndarray_sum(self, axis, dtype, out, keepdims)

cpdef _ndarray_base cumsum(self, axis=None, dtype=None, out=None):
    """Returns the cumulative sum of an array along a given axis.

    .. seealso::
        :func:`cupy.cumsum` for full documentation,
        :meth:`numpy.ndarray.cumsum`
    """
    return _math._ndarray_cumsum(self, axis, dtype, out)

cpdef _ndarray_base mean(
        self, axis=None, dtype=None, out=None, keepdims=False):
    """Returns the mean along a given axis.

    .. seealso::
        :func:`cupy.mean` for full documentation,
        :meth:`numpy.ndarray.mean`
    """
    return _statistics._ndarray_mean(self, axis, dtype, out, keepdims)

cpdef _ndarray_base var(
        self, axis=None, dtype=None, out=None, ddof=0, keepdims=False):
    """Returns the variance along a given axis.

    .. seealso::
        :func:`cupy.var` for full documentation,
        :meth:`numpy.ndarray.var`
    """
    return _statistics._ndarray_var(
        self, axis, dtype, out, ddof, keepdims)

cpdef _ndarray_base std(
        self, axis=None, dtype=None, out=None, ddof=0, keepdims=False):
    """Returns the standard deviation along a given axis.

    .. seealso::
        :func:`cupy.std` for full documentation,
        :meth:`numpy.ndarray.std`
    """
    return _statistics._ndarray_std(self, axis, dtype, out, ddof, keepdims)

cpdef _ndarray_base prod(
        self, axis=None, dtype=None, out=None, keepdims=None):
    """Returns the product along a given axis.

    .. seealso::
        :func:`cupy.prod` for full documentation,
        :meth:`numpy.ndarray.prod`
    """
    # NOTE(review): ``keepdims`` defaults to None here, unlike ``sum``'s
    # False — confirm the delegate treats None and False identically.
    return _math._ndarray_prod(self, axis, dtype, out, keepdims)

cpdef _ndarray_base cumprod(self, axis=None, dtype=None, out=None):
    """Returns the cumulative product of an array along a given axis.

    .. seealso::
        :func:`cupy.cumprod` for full documentation,
        :meth:`numpy.ndarray.cumprod`
    """
    return _math._ndarray_cumprod(self, axis, dtype, out)

cpdef _ndarray_base _add_reduceat(self, indices, axis, dtype, out):
    # Internal helper backing ``cupy.add.reduceat``; delegates to the
    # indexing module.
    return _indexing._add_reduceat(self, indices, axis, dtype, out)

cpdef _ndarray_base all(self, axis=None, out=None, keepdims=False):
    """Tests whether all array elements along a given axis evaluate to True.

    .. seealso::
        :func:`cupy.all` for full documentation,
        :meth:`numpy.ndarray.all`
    """
    return _logic._ndarray_all(self, axis, out, keepdims)

cpdef _ndarray_base any(self, axis=None, out=None, keepdims=False):
    """Tests whether any array element along a given axis evaluates to True.

    .. seealso::
        :func:`cupy.any` for full documentation,
        :meth:`numpy.ndarray.any`
    """
    return _logic._ndarray_any(self, axis, out, keepdims)
# -------------------------------------------------------------------------
# Arithmetic and comparison operations
# -------------------------------------------------------------------------
# Comparison operators:
def __richcmp__(object self, object other, int op):
    # Cython rich-comparison entry point. ``op`` encodes the operator;
    # the mapping below (0:<, 1:<=, 2:==, 3:!=, 4:>, 5:>=) follows from
    # the comparison functions dispatched for each value.
    if isinstance(other, ndarray):
        # ndarray vs ndarray: dispatch to CuPy's element-wise comparisons.
        if op == 0:
            return _logic._ndarray_less(self, other)
        if op == 1:
            return _logic._ndarray_less_equal(self, other)
        if op == 2:
            return _logic._ndarray_equal(self, other)
        if op == 3:
            return _logic._ndarray_not_equal(self, other)
        if op == 4:
            return _logic._ndarray_greater(self, other)
        if op == 5:
            return _logic._ndarray_greater_equal(self, other)
    elif not _should_use_rop(self, other):
        # Non-ndarray operand that does not take dispatch priority:
        # fall back to the NumPy ufuncs (which re-enter CuPy's protocol).
        if isinstance(other, numpy.ndarray) and other.ndim == 0:
            other = other.item()  # Workaround for numpy<1.13
        if op == 0:
            return numpy.less(self, other)
        if op == 1:
            return numpy.less_equal(self, other)
        if op == 2:
            # cupy.ndarray does not support dtype=object, but
            # allow comparison with None, Ellipsis, and etc.
            if type(other).__eq__ is object.__eq__:
                # Implies `other` is neither (Python/NumPy) scalar nor
                # ndarray. With object's default __eq__, it never
                # equals to an element of cupy.ndarray.
                return cupy.zeros(self._shape, dtype=cupy.bool_)
            return numpy.equal(self, other)
        if op == 3:
            if (
                type(other).__eq__ is object.__eq__
                and type(other).__ne__ is object.__ne__
            ):
                # Similar to eq, but ne falls back to `not __eq__`.
                return cupy.ones(self._shape, dtype=cupy.bool_)
            return numpy.not_equal(self, other)
        if op == 4:
            return numpy.greater(self, other)
        if op == 5:
            return numpy.greater_equal(self, other)
    # Operand with dispatch priority: let it implement the reflected op.
    return NotImplemented
# Truth value of an array (bool):
def __nonzero__(self):
    """Truth value of the array (``bool(a)``).

    A size-1 array converts to the truth value of its single element; an
    empty array warns and evaluates to ``False``; anything larger is
    ambiguous and raises :class:`ValueError`.
    """
    n = self.size
    if n == 1:
        # Exactly one element: its host value decides the truth value.
        return bool(self.get())
    if n == 0:
        warnings.warn(
            'The truth value of an empty array is ambiguous. Returning '
            'False, but in future this will result in an error. Use '
            '`array.size > 0` to check that an array is not empty.',
            DeprecationWarning)
        return False
    raise ValueError(
        'The truth value of an array with more than one element is '
        'ambiguous. Use a.any() or a.all()')
# Unary operations:
def __neg__(self):
    # Element-wise negation.
    return _math._negative(self)

def __pos__(self):
    # Element-wise unary plus; deprecated for bool arrays (returns a copy
    # for now, mirroring NumPy's deprecation).
    if self.dtype == numpy.bool_:
        msg = ("Applying '+' to a non-numerical array is ill-defined. "
               'Returning a copy, but in the future this will error.')
        warnings.warn(msg, DeprecationWarning)
        return self.copy()
    return _math._positive(self)

def __abs__(self):
    # Element-wise absolute value.
    return _math._absolute(self)

def __invert__(self):
    # Element-wise bitwise NOT (logical NOT for bool arrays).
    return _binary._invert(self)
# Arithmetic:
# Binary arithmetic slots. Cython calls these for both normal and
# reflected operands, so ``x`` may be either the ndarray or the scalar.
# Pattern for each: ndarray operand -> CuPy ufunc; operand with dispatch
# priority (per _should_use_rop) -> NotImplemented; otherwise -> the NumPy
# ufunc, which re-enters CuPy via the __array_ufunc__ protocol.

def __add__(x, y):
    if isinstance(y, ndarray):
        return _math._add(x, y)
    elif _should_use_rop(x, y):
        return NotImplemented
    else:
        return numpy.add(x, y)

def __sub__(x, y):
    if isinstance(y, ndarray):
        return _math._subtract(x, y)
    elif _should_use_rop(x, y):
        return NotImplemented
    else:
        return numpy.subtract(x, y)

def __mul__(x, y):
    if isinstance(y, ndarray):
        return _math._multiply(x, y)
    elif _should_use_rop(x, y):
        return NotImplemented
    else:
        return numpy.multiply(x, y)

def __matmul__(x, y):
    if isinstance(y, ndarray):
        return _linalg.matmul(x, y)
    elif _should_use_rop(x, y):
        return NotImplemented
    else:
        return numpy.matmul(x, y)

def __div__(x, y):
    # Python 2 style division slot; kept for Cython compatibility.
    if isinstance(y, ndarray):
        return _math._divide(x, y)
    elif _should_use_rop(x, y):
        return NotImplemented
    else:
        return numpy.divide(x, y)

def __truediv__(x, y):
    if isinstance(y, ndarray):
        return _math._true_divide(x, y)
    elif _should_use_rop(x, y):
        return NotImplemented
    else:
        return numpy.true_divide(x, y)

def __floordiv__(x, y):
    if isinstance(y, ndarray):
        return _math._floor_divide(x, y)
    elif _should_use_rop(x, y):
        return NotImplemented
    else:
        return numpy.floor_divide(x, y)

def __mod__(x, y):
    if isinstance(y, ndarray):
        return _math._remainder(x, y)
    elif _should_use_rop(x, y):
        return NotImplemented
    else:
        return numpy.remainder(x, y)
def __divmod__(x, y):
    """Element-wise ``divmod``; returns the (floor-quotient, remainder) pair.

    Follows the same dispatch pattern as the other binary slots: ndarray
    operand -> CuPy ufunc; priority operand -> NotImplemented; otherwise
    -> the NumPy ufunc.
    """
    if isinstance(y, ndarray):
        # BUG FIX: this branch previously returned the builtin
        # ``divmod(x, y)``, which dispatches straight back into this slot
        # method (both the forward and reflected lookups land here in a
        # Cython cdef class), recursing until RecursionError. Delegate to
        # the ``cupy.divmod`` ufunc instead, mirroring the
        # ``numpy.divmod`` fallback below.
        return cupy.divmod(x, y)
    elif _should_use_rop(x, y):
        return NotImplemented
    else:
        return numpy.divmod(x, y)
def __pow__(x, y, modulo):
    # Note that we ignore the modulo argument as well as NumPy.
    if isinstance(y, ndarray):
        return _math._power(x, y)
    elif _should_use_rop(x, y):
        return NotImplemented
    else:
        return numpy.power(x, y)

# Bitwise/shift slots follow the same three-way dispatch as the
# arithmetic slots above.

def __lshift__(x, y):
    if isinstance(y, ndarray):
        return _binary._left_shift(x, y)
    elif _should_use_rop(x, y):
        return NotImplemented
    else:
        return numpy.left_shift(x, y)

def __rshift__(x, y):
    if isinstance(y, ndarray):
        return _binary._right_shift(x, y)
    elif _should_use_rop(x, y):
        return NotImplemented
    else:
        return numpy.right_shift(x, y)

def __and__(x, y):
    if isinstance(y, ndarray):
        return _binary._bitwise_and(x, y)
    elif _should_use_rop(x, y):
        return NotImplemented
    else:
        return numpy.bitwise_and(x, y)

def __or__(x, y):
    if isinstance(y, ndarray):
        return _binary._bitwise_or(x, y)
    elif _should_use_rop(x, y):
        return NotImplemented
    else:
        return numpy.bitwise_or(x, y)

def __xor__(x, y):
    if isinstance(y, ndarray):
        return _binary._bitwise_xor(x, y)
    elif _should_use_rop(x, y):
        return NotImplemented
    else:
        return numpy.bitwise_xor(x, y)
# Arithmetic, in-place:
# In-place arithmetic: each ufunc is called with ``self`` as the ``out``
# argument, so the result is written into this array's own buffer.

def __iadd__(self, other):
    return _math._add(self, other, self)

def __isub__(self, other):
    return _math._subtract(self, other, self)

def __imul__(self, other):
    return _math._multiply(self, other, self)

def __idiv__(self, other):
    return _math._divide(self, other, self)

def __itruediv__(self, other):
    return _math._true_divide(self, other, self)

def __ifloordiv__(self, other):
    return _math._floor_divide(self, other, self)

def __imod__(self, other):
    return _math._remainder(self, other, self)

def __ipow__(self, other):
    return _math._power(self, other, self)

def __ilshift__(self, other):
    return _binary._left_shift(self, other, self)

def __irshift__(self, other):
    return _binary._right_shift(self, other, self)

def __iand__(self, other):
    return _binary._bitwise_and(self, other, self)

def __ior__(self, other):
    return _binary._bitwise_or(self, other, self)

def __ixor__(self, other):
    return _binary._bitwise_xor(self, other, self)

cpdef _ndarray_base conj(self):
    # Complex conjugate; alias of ``conjugate``.
    return _math._ndarray_conj(self)

cpdef _ndarray_base conjugate(self):
    # Complex conjugate; alias of ``conj``.
    return _math._ndarray_conj(self)

@property
def real(self):
    # Real part of the array (settable).
    return _math._ndarray_real_getter(self)

@real.setter
def real(self, value):
    _math._ndarray_real_setter(self, value)

@property
def imag(self):
    # Imaginary part of the array (settable).
    return _math._ndarray_imag_getter(self)

@imag.setter
def imag(self, value):
    _math._ndarray_imag_setter(self, value)
# -------------------------------------------------------------------------
# Special methods
# -------------------------------------------------------------------------
# For standard library functions:
def __copy__(self):
    # Shallow copy protocol (``copy.copy``): a full device-side copy.
    return self.copy()

def __deepcopy__(self, memo):
    # Make a contiguous copy while the array's own device is current, so
    # ``copy.deepcopy`` works when called from another device context.
    prev_device = runtime.getDevice()
    try:
        runtime.setDevice(self.device.id)
        return self.copy()
    finally:
        runtime.setDevice(prev_device)

def __reduce__(self):
    # Pickle support: serialize via the host copy obtained by ``get()``.
    return array, (self.get(),)

# Basic customization:

# _ndarray_base does not define __new__

def __array__(self, dtype=None):
    # TODO(imanishi): Support an environment variable or a global
    # configure flag that allows implicit conversions to NumPy array.
    # (See https://github.com/cupy/cupy/issues/589 for the detail.)
    raise TypeError(
        'Implicit conversion to a NumPy array is not allowed. '
        'Please use `.get()` to construct a NumPy array explicitly.')

@classmethod
def __class_getitem__(cls, tuple item):
    # PEP 585-style subscription, e.g. ``cupy.ndarray[Any, dtype]``.
    from cupy.typing._generic_alias import GenericAlias
    item1, item2 = item
    return GenericAlias(cupy.ndarray, (item1, item2))

# TODO(okuta): Implement __array_wrap__

# Container customization:

def __iter__(self):
    # Iterates over the first axis, yielding subarray views.
    if self._shape.size() == 0:
        raise TypeError('iteration over a 0-d array')
    return (self[i] for i in range(self._shape[0]))

def __len__(self):
    # Length of the first axis; 0-d arrays have no length.
    if self._shape.size() == 0:
        raise TypeError('len() of unsized object')
    return self._shape[0]
def __getitem__(self, slices):
    """x.__getitem__(y) <==> x[y]

    Supports both basic and advanced indexing.

    .. note::
        Currently, it does not support ``slices`` that consists of more
        than one boolean arrays

    .. note::
        CuPy handles out-of-bounds indices differently from NumPy.
        NumPy handles them by raising an error, but CuPy wraps around them.

    Example:
        >>> a = cupy.arange(3)
        >>> a[[1, 3]]
        array([1, 0])
    """
    return _indexing._ndarray_getitem(self, slices)
def __setitem__(self, slices, value):
    """x.__setitem__(slices, y) <==> x[slices] = y

    Supports both basic and advanced indexing.

    .. note::
        Currently, it does not support ``slices`` that consists of more
        than one boolean arrays

    .. note::
        CuPy handles out-of-bounds indices differently from NumPy when
        using integer array indexing.
        NumPy handles them by raising an error, but CuPy wraps around them.

        >>> import cupy
        >>> x = cupy.arange(3)
        >>> x[[1, 3]] = 10
        >>> x
        array([10, 10, 2])

    .. note::
        The behavior differs from NumPy when integer arrays in ``slices``
        reference the same location multiple times.
        In that case, the value that is actually stored is undefined.

        >>> import cupy
        >>> a = cupy.zeros((2,))
        >>> i = cupy.arange(10000) % 2
        >>> v = cupy.arange(10000).astype(cupy.float_)
        >>> a[i] = v
        >>> a  # doctest: +SKIP
        array([9150., 9151.])

        On the other hand, NumPy stores the value corresponding to the
        last index among the indices referencing duplicate locations.

        >>> import numpy
        >>> a_cpu = numpy.zeros((2,))
        >>> i_cpu = numpy.arange(10000) % 2
        >>> v_cpu = numpy.arange(10000).astype(numpy.float_)
        >>> a_cpu[i_cpu] = v_cpu
        >>> a_cpu
        array([9998., 9999.])
    """
    # Fast path (opt-in via ENABLE_SLICE_COPY): ``a[:] = numpy_array``
    # becomes a direct host-to-device memcpy instead of a scatter.
    if _util.ENABLE_SLICE_COPY and (
        type(slices) is slice
        and slices == slice(None, None, None)
        and isinstance(value, numpy.ndarray)
    ):
        if (self.dtype == value.dtype
                and self.shape == value.shape
                and (self._f_contiguous or self._c_contiguous)):
            # Ravel the host array in this array's memory order so the
            # flat byte layouts match.
            order = 'F' if self._f_contiguous else 'C'
            tmp = value.ravel(order)
            ptr = tmp.ctypes.data
            # Synchronous copy on the null stream, async otherwise.
            stream_ptr = stream_module.get_current_stream_ptr()
            if stream_ptr == 0:
                self.data.copy_from_host(ptr, self.nbytes)
            else:
                self.data.copy_from_host_async(ptr, self.nbytes)
        else:
            raise ValueError(
                'copying a numpy.ndarray to a cupy.ndarray by empty slice '
                'assignment must ensure arrays have same shape and dtype')
    else:
        # General path: basic/advanced-indexing scatter.
        _indexing._ndarray_setitem(self, slices, value)
def scatter_add(self, slices, value):
    """Adds given values to specified elements of an array.

    .. seealso::
        :func:`cupyx.scatter_add` for full documentation.
    """
    warnings.warn(
        '`ndarray.scatter_add` is deprecated. '
        'Please use `cupy.add.at` instead.',
        DeprecationWarning)
    self._scatter_op(slices, value, 'add')
def scatter_max(self, slices, value):
    """Stores a maximum value of elements specified by indices to an array.

    .. seealso::
        :func:`cupyx.scatter_max` for full documentation.
    """
    # FIX: the deprecation message was missing the sentence period after
    # 'deprecated', unlike the scatter_add message; made consistent.
    warnings.warn(
        '`ndarray.scatter_max` is deprecated. '
        'Please use `cupy.maximum.at` instead.',
        DeprecationWarning)
    self._scatter_op(slices, value, 'max')
def scatter_min(self, slices, value):
    """Stores a minimum value of elements specified by indices to an array.

    .. seealso::
        :func:`cupyx.scatter_min` for full documentation.
    """
    # FIX: the deprecation message was missing the sentence period after
    # 'deprecated', unlike the scatter_add message; made consistent.
    warnings.warn(
        '`ndarray.scatter_min` is deprecated. '
        'Please use `cupy.minimum.at` instead.',
        DeprecationWarning)
    self._scatter_op(slices, value, 'min')
def _scatter_op(self, slices, value, op):
    # Internal dispatcher shared by scatter_add/scatter_max/scatter_min
    # and __setitem__; ``op`` selects the scatter reduction.
    _indexing._scatter_op(self, slices, value, op)
# TODO(okuta): Implement __getslice__
# TODO(okuta): Implement __setslice__
# TODO(okuta): Implement __contains__
# numpy/ufunc compat
def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
    """Apply unary or binary ufunc to this array

    If binary, only allow if second argument is another cupy ndarray or
    a number, i.e., raise ValueError instead of silently converting a
    numpy array.
    """
    import cupy  # top-level ufuncs
    import cupyx.scipy.special  # special ufuncs
    inout = inputs
    if 'out' in kwargs:
        # need to unfold tuple argument in kwargs
        # TODO(ecastill) GUFuncs support more than one output
        out = kwargs['out']
        if len(out) != 1:
            raise ValueError('The \'out\' parameter must have exactly one '
                             'array value')
        # ``out`` participates in the type check below alongside inputs.
        inout += out
        kwargs['out'] = out[0]
    if method in (
        '__call__', 'outer', 'at', 'reduce', 'accumulate', 'reduceat'
    ):
        # Resolve the CuPy counterpart of the NumPy ufunc by name, first
        # in cupy's top-level namespace, then in cupyx.scipy.special.
        name = ufunc.__name__
        try:
            func = getattr(cupy, name, None) or getattr(
                cupyx.scipy.special, name
            )
            if method != '__call__':
                func = getattr(func, method)
        except AttributeError:
            return NotImplemented
        for x in inout:
            # numpy.ndarray is handled and then TypeError is raised due to
            # implicit host-to-device conversion.
            # Except for numpy.ndarray, types should be supported by
            # `_kernel._preprocess_args`.
            check = (hasattr(x, '__cuda_array_interface__')
                     or hasattr(x, '__cupy_get_ndarray__'))
            if runtime._is_hip_environment and isinstance(x, ndarray):
                check = True
            if (not check
                    and not type(x) in _scalar.scalar_type_set
                    and not isinstance(x, numpy.ndarray)):
                return NotImplemented
        if name in [
                'greater', 'greater_equal', 'less', 'less_equal',
                'equal', 'not_equal']:
            # workaround for numpy/numpy#12142
            inputs = tuple([
                x.item()
                if isinstance(x, numpy.ndarray) and x.ndim == 0
                else x
                for x in inputs
            ])
        return func(*inputs, **kwargs)
    else:
        return NotImplemented
def __array_function__(self, func, types, args, kwargs):
    # NEP 18 protocol: route ``numpy.foo(...)`` calls on cupy arrays to
    # the CuPy function of the same module path and name.
    try:
        module = functools.reduce(
            getattr, func.__module__.split('.')[1:], cupy)
        cupy_func = getattr(module, func.__name__)
    except AttributeError:
        return NotImplemented
    if cupy_func is func:
        # avoid NumPy func
        return NotImplemented
    # Every participating type must be one CuPy knows how to handle.
    for t in types:
        for handled_type in _HANDLED_TYPES:
            if issubclass(t, handled_type):
                break
        else:
            return NotImplemented
    return cupy_func(*args, **kwargs)
# Conversion:
# Conversion and representation: all of these transfer the array to the
# host via ``get()`` and delegate to the host object's behavior.

def __int__(self):
    return int(self.get())

def __float__(self):
    return float(self.get())

def __complex__(self):
    return complex(self.get())

def __oct__(self):
    return oct(self.get())

def __hex__(self):
    return hex(self.get())

def __bytes__(self):
    return bytes(self.get())

# String representations:

def __repr__(self):
    return repr(self.get())

def __str__(self):
    return str(self.get())

def __format__(self, format_spec):
    return format(self.get(), format_spec)
# -------------------------------------------------------------------------
# Methods outside of the ndarray main documentation
# -------------------------------------------------------------------------
def dot(self, _ndarray_base b, _ndarray_base out=None):
    """Returns the dot product with given array.

    .. seealso::
        :func:`cupy.dot` for full documentation,
        :meth:`numpy.ndarray.dot`
    """
    return _linalg.dot(self, b, out)
# -------------------------------------------------------------------------
# Cupy specific attributes and methods
# -------------------------------------------------------------------------
@property
def device(self):
    """CUDA device on which this array resides."""
    return self.data.device
cpdef get(self, stream=None, order='C', out=None):
    """Returns a copy of the array on host memory.

    Args:
        stream (cupy.cuda.Stream): CUDA stream object. If it is given, the
            copy runs asynchronously. Otherwise, the copy is synchronous.
            The default uses CUDA stream object of the current context.
        order ({'C', 'F', 'A'}): The desired memory layout of the host
            array. When ``order`` is 'A', it uses 'F' if the array is
            fortran-contiguous and 'C' otherwise. The ``order`` will be
            ignored if ``out`` is specified.
        out (numpy.ndarray): Output array. In order to enable asynchronous
            copy, the underlying memory should be a pinned memory.

    Returns:
        numpy.ndarray: Copy of the array on host memory.
    """
    if out is not None:
        # Caller-supplied destination: validate dtype/shape, then make
        # sure the device source matches out's memory layout.
        if not isinstance(out, numpy.ndarray):
            raise TypeError('Only numpy.ndarray can be obtained from'
                            'cupy.ndarray')
        if self.dtype != out.dtype:
            raise TypeError(
                '{} array cannot be obtained from {} array'.format(
                    out.dtype, self.dtype))
        if self.shape != out.shape:
            raise ValueError(
                'Shape mismatch. Expected shape: {}, '
                'actual shape: {}'.format(self.shape, out.shape))
        if not (out.flags.c_contiguous and self._c_contiguous or
                out.flags.f_contiguous and self._f_contiguous):
            # Layouts differ: materialize a device copy in out's order,
            # on this array's own device.
            prev_device = runtime.getDevice()
            try:
                runtime.setDevice(self.device.id)
                if out.flags.c_contiguous:
                    a_gpu = _internal_ascontiguousarray(self)
                elif out.flags.f_contiguous:
                    a_gpu = _internal_asfortranarray(self)
                else:
                    raise RuntimeError(
                        '`out` cannot be specified when copying to '
                        'non-contiguous ndarray')
            finally:
                runtime.setDevice(prev_device)
        else:
            a_gpu = self
        a_cpu = out
    else:
        # Allocate a fresh host array in the requested order.
        if self.size == 0:
            # Nothing to copy for empty arrays.
            return numpy.ndarray(self._shape, dtype=self.dtype)
        order = order.upper()
        if order == 'A':
            if self._f_contiguous:
                order = 'F'
            else:
                order = 'C'
        if not (order == 'C' and self._c_contiguous or
                order == 'F' and self._f_contiguous):
            # Make the source contiguous in the requested order first.
            prev_device = runtime.getDevice()
            try:
                runtime.setDevice(self.device.id)
                if order == 'C':
                    a_gpu = _internal_ascontiguousarray(self)
                elif order == 'F':
                    a_gpu = _internal_asfortranarray(self)
                else:
                    raise ValueError('unsupported order: {}'.format(order))
            finally:
                runtime.setDevice(prev_device)
        else:
            a_gpu = self
        a_cpu = numpy.empty(self._shape, dtype=self.dtype, order=order)
    # Device-to-host transfer; counts as a synchronization point for the
    # sync-detection machinery.
    syncdetect._declare_synchronize()
    ptr = a_cpu.ctypes.data
    prev_device = runtime.getDevice()
    try:
        runtime.setDevice(self.device.id)
        if stream is not None:
            a_gpu.data.copy_to_host_async(ptr, a_gpu.nbytes, stream)
        else:
            # No explicit stream: synchronous on the null stream,
            # asynchronous on the current non-null stream.
            stream_ptr = stream_module.get_current_stream_ptr()
            if stream_ptr == 0:
                a_gpu.data.copy_to_host(ptr, a_gpu.nbytes)
            else:
                a_gpu.data.copy_to_host_async(ptr, a_gpu.nbytes)
    finally:
        runtime.setDevice(prev_device)
    return a_cpu
cpdef set(self, arr, stream=None):
    """Copies an array on the host memory to :class:`cupy.ndarray`.

    Args:
        arr (numpy.ndarray): The source array on the host memory.
        stream (cupy.cuda.Stream): CUDA stream object. If it is given, the
            copy runs asynchronously. Otherwise, the copy is synchronous.
            The default uses CUDA stream object of the current context.
    """
    if not isinstance(arr, numpy.ndarray):
        raise TypeError('Only numpy.ndarray can be set to cupy.ndarray')
    if self.dtype != arr.dtype:
        raise TypeError('{} array cannot be set to {} array'.format(
            arr.dtype, self.dtype))
    if self.shape != arr.shape:
        raise ValueError(
            'Shape mismatch. Old shape: {}, new shape: {}'.format(
                self.shape, arr.shape))
    # Convert the host source to match this array's memory layout so a
    # flat byte copy is valid.
    if self._c_contiguous:
        arr = numpy.ascontiguousarray(arr)
    elif self._f_contiguous:
        arr = numpy.asfortranarray(arr)
    else:
        raise RuntimeError('Cannot set to non-contiguous array')
    ptr = arr.ctypes.data
    prev_device = runtime.getDevice()
    try:
        runtime.setDevice(self.device.id)
        if stream is not None:
            self.data.copy_from_host_async(ptr, self.nbytes, stream)
        else:
            # No explicit stream: synchronous on the null stream,
            # asynchronous on the current non-null stream.
            stream_ptr = stream_module.get_current_stream_ptr()
            if stream_ptr == 0:
                self.data.copy_from_host(ptr, self.nbytes)
            else:
                self.data.copy_from_host_async(ptr, self.nbytes)
    finally:
        runtime.setDevice(prev_device)
cpdef _ndarray_base reduced_view(self, dtype=None):
    """Returns a view of the array with minimum number of dimensions.

    Args:
        dtype: (Deprecated) Data type specifier.
            If it is given, then the memory
            sequence is reinterpreted as the new type.

    Returns:
        cupy.ndarray: A view of the array with reduced dimensions.
    """
    cdef shape_t shape
    cdef strides_t strides
    cdef Py_ssize_t ndim
    cdef _ndarray_base view
    if dtype is not None:
        warnings.warn(
            'calling reduced_view with dtype is deprecated',
            DeprecationWarning)
        return self.reduced_view().view(dtype)
    ndim = self._shape.size()
    if ndim <= 1:
        # Already minimal.
        return self
    if self._c_contiguous:
        # C-contiguous arrays always collapse to a single flat axis.
        view = self.view()
        view._shape.assign(1, self.size)
        view._strides.assign(1, self.dtype.itemsize)
        view._update_f_contiguity()
        return view
    # General case: merge adjacent axes whose strides allow it.
    internal.get_reduced_dims(
        self._shape, self._strides, self.dtype.itemsize, shape, strides)
    if ndim == <Py_ssize_t>shape.size():
        # No axes could be merged.
        return self
    # TODO(niboshi): Confirm update_x_contiguity flags
    return self._view(type(self), shape, strides, False, True, self)
cpdef _update_c_contiguity(self):
    # Recomputes the cached C-contiguity flag from shape/strides.
    if self.size == 0:
        # Empty arrays are contiguous by convention.
        self._c_contiguous = True
        return
    self._c_contiguous = internal.get_c_contiguity(
        self._shape, self._strides, self.dtype.itemsize)

cpdef _update_f_contiguity(self):
    # Recomputes the cached Fortran-contiguity flag from shape/strides.
    cdef Py_ssize_t i, count
    cdef shape_t rev_shape
    cdef strides_t rev_strides
    if self.size == 0:
        self._f_contiguous = True
        return
    if self._c_contiguous:
        # A C-contiguous array is also F-contiguous iff it has at most
        # one axis with extent > 1.
        count = 0
        for i in self._shape:
            if i == 1:
                count += 1
        self._f_contiguous = (<Py_ssize_t>self._shape.size()) - count <= 1
        return
    # F-contiguity equals C-contiguity of the reversed shape/strides.
    rev_shape.assign(self._shape.rbegin(), self._shape.rend())
    rev_strides.assign(self._strides.rbegin(), self._strides.rend())
    self._f_contiguous = internal.get_c_contiguity(
        rev_shape, rev_strides, self.dtype.itemsize)

cpdef _update_contiguity(self):
    # Refreshes both contiguity flags.
    self._update_c_contiguity()
    self._update_f_contiguity()
cpdef _set_shape_and_strides(self, const shape_t& shape,
                             const strides_t& strides,
                             bint update_c_contiguity,
                             bint update_f_contiguity):
    # Installs new shape/strides, recomputes ``size`` and (optionally)
    # the contiguity flags. Callers that already know the resulting
    # contiguity pass False to skip the recomputation.
    if shape.size() != strides.size():
        raise ValueError('len(shape) != len(strides)')
    if shape.size() > _carray.MAX_NDIM:
        msg = 'maximum supported dimension for an ndarray is '
        msg += f'{_carray.MAX_NDIM}, found {shape.size()}'
        raise ValueError(msg)
    self._shape = shape
    self._strides = strides
    self.size = internal.prod(shape)
    if update_c_contiguity:
        self._update_c_contiguity()
    if update_f_contiguity:
        self._update_f_contiguity()
cdef _ndarray_base _view(self, subtype, const shape_t& shape,
                         const strides_t& strides,
                         bint update_c_contiguity,
                         bint update_f_contiguity, obj):
    # Create a view of `self` with the given shape/strides.  `subtype`
    # supports ndarray subclasses; `obj` is forwarded to __new__.
    cdef _ndarray_base v
    # Use `_no_init=True` to skip recomputation of contiguity. Now
    # calling `__array_finalize__` is responsibility of this method.`
    v = ndarray.__new__(subtype, _obj=obj, _no_init=True)
    v.data = self.data
    # Views always point at the root array, never at another view.
    v.base = self.base if self.base is not None else self
    v.dtype = self.dtype
    v._c_contiguous = self._c_contiguous
    v._f_contiguous = self._f_contiguous
    v._index_32_bits = self._index_32_bits
    v._set_shape_and_strides(
        shape, strides, update_c_contiguity, update_f_contiguity)
    if subtype is not ndarray:
        v.__array_finalize__(self)
    return v
cpdef _set_contiguous_strides(
        self, Py_ssize_t itemsize, bint is_c_contiguous):
    # Fill self._strides in place for a contiguous layout and update
    # self.size; only the contiguity flag not implied by construction is
    # recomputed.
    self.size = internal.get_contiguous_strides_inplace(
        self._shape, self._strides, itemsize, is_c_contiguous, True)
    if is_c_contiguous:
        self._c_contiguous = True
        self._update_f_contiguity()
    else:
        self._f_contiguous = True
        self._update_c_contiguity()
cdef function.CPointer get_pointer(self):
    # Wrap this array in a CArray struct for passing to CUDA kernels.
    return _CArray_from_ndarray(self)
cpdef object toDlpack(self):
    """Zero-copy conversion to a DLPack tensor.

    DLPack is a open in memory tensor structure proposed in this
    repository: `dmlc/dlpack <https://github.com/dmlc/dlpack>`_.

    This function returns a :class:`PyCapsule` object which contains a
    pointer to a DLPack tensor converted from the own ndarray. This
    function does not copy the own data to the output DLpack tensor
    but it shares the pointer which is pointing to the same memory region
    for the data.

    Returns:
        dltensor (:class:`PyCapsule`): Output DLPack tensor which is
            encapsulated in a :class:`PyCapsule` object.

    .. seealso::

        :meth:`~cupy.fromDlpack` is a method for zero-copy conversion from
        a DLPack tensor (which is encapsulated in a :class:`PyCapsule`
        object) to a :class:`ndarray`

    .. warning::

        As of the DLPack v0.3 specification, it is (implicitly) assumed
        that the user is responsible to ensure the Producer and the
        Consumer are operating on the same stream. This requirement might
        be relaxed/changed in a future DLPack version.

    .. admonition:: Example

        >>> import cupy
        >>> array1 = cupy.array([0, 1, 2], dtype=cupy.float32)
        >>> dltensor = array1.toDlpack()
        >>> array2 = cupy.fromDlpack(dltensor)
        >>> cupy.testing.assert_array_equal(array1, array2)

    """
    # Delegates to the module-level implementation in cupy._core.dlpack.
    return dlpack.toDlpack(self)
cdef inline _carray.CArray _CArray_from_ndarray(_ndarray_base arr):
    # Creates CArray from ndarray.
    # Note that this function cannot be defined in _carray.pxd because that
    # would cause cyclic cimport dependencies.
    cdef _carray.CArray carr = _carray.CArray.__new__(_carray.CArray)
    # The CArray borrows the ndarray's device pointer; `arr` must stay
    # alive for as long as the CArray is in use.
    carr.init(<void*>arr.data.ptr, arr.size, arr._shape, arr._strides)
    return carr
# Types accepted by the __array_ufunc__/__array_function__ protocols.
_HANDLED_TYPES = (ndarray, numpy.ndarray)


# =============================================================================
# compile_with_cache
# =============================================================================

# TODO(niboshi): Move it out of core.pyx
cdef bint _is_hip = runtime._is_hip_environment
cdef int _cuda_runtime_version = -1  # -1 means "not yet queried"
cdef str _cuda_path = ''  # '' for uninitialized, None for non-existing

# Headers textually prepended (in this order) to every kernel compiled
# through compile_with_cache when prepend_cupy_headers is True.
cdef list cupy_header_list = [
    'cupy/complex.cuh',
    'cupy/carray.cuh',
    'cupy/atomics.cuh',
    'cupy/math_constants.h',
]
if _is_hip:
    cupy_header_list.append('cupy/hip_workaround.cuh')

# expose to Python for unit testing
_cupy_header_list = cupy_header_list

cdef str _cupy_header = ''.join(
    ['#include <%s>\n' % i for i in cupy_header_list])

# This is indirect include header list.
# These header files are subject to a hash key.
cdef list _cupy_extra_header_list = [
    'cupy/complex/complex.h',
    'cupy/complex/math_private.h',
    'cupy/complex/complex_inl.h',
    'cupy/complex/arithmetic.h',
    'cupy/complex/cproj.h',
    'cupy/complex/cexp.h',
    'cupy/complex/cexpf.h',
    'cupy/complex/clog.h',
    'cupy/complex/clogf.h',
    'cupy/complex/cpow.h',
    'cupy/complex/ccosh.h',
    'cupy/complex/ccoshf.h',
    'cupy/complex/csinh.h',
    'cupy/complex/csinhf.h',
    'cupy/complex/ctanh.h',
    'cupy/complex/ctanhf.h',
    'cupy/complex/csqrt.h',
    'cupy/complex/csqrtf.h',
    'cupy/complex/catrig.h',
    'cupy/complex/catrigf.h',
    'cupy/swap.cuh',
    'cupy/tuple/type_traits.h',
    'cupy/tuple/tuple.h',
    'cupy/tuple.cuh',
]

# Lazily populated caches; see _get_header_dir_path / _get_header_source.
cdef str _header_path_cache = None
cdef str _header_source = None
cdef dict _header_source_map = {}
cpdef str _get_header_dir_path():
    """Return the absolute path of CuPy's bundled `include` directory."""
    global _header_path_cache
    if _header_path_cache is not None:
        return _header_path_cache
    # Cython cannot use __file__ in global scope, so resolve it lazily here
    # and memoize the result.
    include_dir = os.path.join(os.path.dirname(__file__), 'include')
    _header_path_cache = os.path.abspath(include_dir)
    return _header_path_cache
cpdef str _get_header_source():
    """Return the concatenated text of all bundled CuPy headers.

    As a side effect, populates ``_header_source_map`` (header path as
    bytes -> header contents as bytes).
    """
    global _header_source
    global _header_source_map
    cdef str header_path, base_path, file_path, header
    cdef list source
    if _header_source is not None and _header_source_map:
        return _header_source
    base_path = _get_header_dir_path()
    source = []
    for file_path in _cupy_extra_header_list + cupy_header_list:
        header_path = os.path.join(base_path, file_path)
        with open(header_path) as header_file:
            header = header_file.read()
        source.append(header)
        _header_source_map[file_path.encode()] = header.encode()
    _header_source = '\n'.join(source)
    return _header_source
cpdef dict _get_header_source_map():
    # Mapping of header path (bytes) -> header contents (bytes), built
    # lazily by _get_header_source() as a side effect.
    global _header_source_map
    if not _header_source_map:
        _get_header_source()
    return _header_source_map
# added at the module level for precompiling the regex
# Matches an `#include <cuComplex.h>` line with arbitrary whitespace
# between the tokens (the leading '' token permits leading whitespace).
_cucomplex_include_tokens = ['', '#', 'include', '<', r'cuComplex\.h', '>']
_cucomplex_include_pattern = re.compile(r'\s*'.join(_cucomplex_include_tokens))
cdef inline str _translate_cucomplex_to_thrust(str source):
    """Rewrite ``#include <cuComplex.h>`` lines to CuPy's bridge header.

    Each matching include line is replaced with
    ``#include <cupy/cuComplex_bridge.h>`` (tagged with a
    ``// translate_cucomplex`` marker); all other lines are kept as-is.
    """
    lines = []
    for line in source.splitlines(keepends=True):
        if _cucomplex_include_pattern.match(line):
            lines.append('#include <cupy/cuComplex_bridge.h> '
                         '// translate_cucomplex\n')
        else:
            # Bug fix: the original used `lines += line`, which extends the
            # list with the individual *characters* of `line`.  The final
            # join reassembled them, so the output was correct only by
            # accident, at O(len(line)) appends per line.
            lines.append(line)
    return ''.join(lines)
cpdef function.Module compile_with_cache(
        str source, tuple options=(), arch=None, cachd_dir=None,
        prepend_cupy_headers=True, backend='nvrtc', translate_cucomplex=False,
        enable_cooperative_groups=False, name_expressions=None,
        log_stream=None, bint jitify=False):
    """Compile a CUDA/HIP source through the cached compiler pipeline.

    NOTE: the misspelled parameter name ``cachd_dir`` (sic) is part of the
    public signature and must not be renamed.
    """
    if translate_cucomplex:
        source = _translate_cucomplex_to_thrust(source)
        # NOTE(review): this appends on every call, so repeated calls with
        # translate_cucomplex=True grow cupy_header_list with duplicate
        # entries -- confirm this is intended.
        cupy_header_list.append('cupy/cuComplex_bridge.h')
        prepend_cupy_headers = True

    if prepend_cupy_headers:
        source = _cupy_header + source
    extra_source = _get_header_source()
    options += ('-I%s' % _get_header_dir_path(),)

    # The variable _cuda_runtime_version is declared in cupy/_core/core.pyx,
    # but it might not have been set appropriately before coming here.
    global _cuda_runtime_version
    if _cuda_runtime_version < 0:
        _cuda_runtime_version = runtime.runtimeGetVersion()

    global _cuda_path
    if _cuda_path == '':
        # '' means "not looked up yet"; None means the lookup failed.
        if not _is_hip:
            _cuda_path = cuda.get_cuda_path()
        else:
            _cuda_path = cuda.get_rocm_path()

    if not _is_hip:
        # Pick the bundled CUDA header set matching the runtime version.
        if 10020 <= _cuda_runtime_version < 10030:
            bundled_include = 'cuda-10.2'
        elif 11000 <= _cuda_runtime_version < 11010:
            bundled_include = 'cuda-11.0'
        elif 11010 <= _cuda_runtime_version < 11020:
            bundled_include = 'cuda-11.1'
        elif 11020 <= _cuda_runtime_version < 12000:
            # CUDA Enhanced Compatibility
            bundled_include = 'cuda-11'
        elif 12000 <= _cuda_runtime_version < 13000:
            # CUDA Enhanced Compatibility
            bundled_include = 'cuda-12'
        else:
            # CUDA versions not yet supported.
            bundled_include = None
        if bundled_include is None and _cuda_path is None:
            raise RuntimeError(
                'Failed to auto-detect CUDA root directory. '
                'Please specify `CUDA_PATH` environment variable if you '
                'are using CUDA versions not yet supported by CuPy.')
        if bundled_include is not None:
            options += ('-I' + os.path.join(
                _get_header_dir_path(), 'cupy', '_cuda', bundled_include),)
    elif _is_hip:
        if _cuda_path is None:
            raise RuntimeError(
                'Failed to auto-detect ROCm root directory. '
                'Please specify `ROCM_HOME` environment variable.')

    if _cuda_path is not None:
        options += ('-I' + os.path.join(_cuda_path, 'include'),)

    return cuda.compiler._compile_module_with_cache(
        source, options, arch, cachd_dir, extra_source, backend,
        enable_cooperative_groups=enable_cooperative_groups,
        name_expressions=name_expressions, log_stream=log_stream,
        jitify=jitify)
# =============================================================================
# Routines
# =============================================================================

# Identity-assignment snippet shared by simple copy routines.
cdef str _id = 'out0 = in0'

# Elementwise kernel that broadcasts a scalar into every output element.
cdef fill_kernel = ElementwiseKernel('T x', 'T y', 'y = x', 'cupy_fill')
# CUDA body shared by the floating-point specializations of divmod.
cdef str _divmod_float = '''
out0_type a = _floor_divide(in0, in1);
out0 = a;
out1 = in0 - a * in1'''


# divmod ufunc: integer types guard against division by zero (returning
# (0, 0), matching NumPy's behavior of not trapping on the GPU); float
# types use the unguarded body above.
divmod = create_ufunc(
    'cupy_divmod',
    ('bb->bb', 'BB->BB', 'hh->hh', 'HH->HH', 'ii->ii', 'II->II', 'll->ll',
     'LL->LL', 'qq->qq', 'QQ->QQ',
     ('ee->ee', _divmod_float),
     ('ff->ff', _divmod_float),
     ('dd->dd', _divmod_float)),
    '''
    if (in1 == 0) {
        out0 = 0;
        out1 = 0;
    } else {
        out0_type a = _floor_divide(in0, in1);
        out0 = a;
        out1 = in0 - a * in1;
    }''')
# Device preamble for the rounding routines below:
# - round_float: round a float to the nearest long long (HIP has no
#   __float2ll_rn intrinsic, so llrintf is used there instead).
# - pow10<T>: integer power of ten via binary exponentiation.
cdef _round_preamble = '''
#ifdef __HIP_DEVICE_COMPILE__
#define round_float llrintf
#else
#define round_float __float2ll_rn
#endif
template<typename T> __device__ T pow10(long long n){
    T x = 1, a = 10;
    while (n) {
        if (n & 1) x *= a;
        a *= a;
        n >>= 1;
    }
    return x;
};
'''

# Round a real value to `in1` decimal places (negative in1 rounds to tens,
# hundreds, ...).
cdef _round_float = '''
if (in1 == 0) {
    out0 = rint(in0);
} else {
    double x;
    x = pow10<double>(abs(in1)); // TODO(okuta): Move before loop
    out0 = in1 < 0 ? rint(in0 / x) * x : rint(in0 * x) / x;
}'''

# Same as _round_float, applied independently to the real and imaginary
# parts of a complex value.
cdef _round_complex = '''
if (in1 == 0) {
    out0 = in0_type(rint(in0.real()), rint(in0.imag()));
} else {
    double x = pow10<double>(abs(in1)); // TODO(okuta): Move before loop
    if (in1 < 0) {
        out0 = in0_type(rint(in0.real() / x) * x,
                        rint(in0.imag() / x) * x);
    } else {
        out0 = in0_type(rint(in0.real() * x) / x,
                        rint(in0.imag() * x) / x);
    }
}'''

# There is a known incompatibility with NumPy (as of 1.16.4) such as
# `numpy.around(2**63, -1) == cupy.around(2**63, -1)` gives `False`.
#
# NumPy seems to round integral values via double. As double has
# only 53 bit precision, last few bits of (u)int64 value may be lost.
# As a consequence, `numpy.around(2**63, -1)` does NOT round up the
# last digit (9223372036854775808 instead of ...810).
#
# The following code fixes the problem, so `cupy.around(2**63, -1)`
# gives `...810`, which (may correct but) is incompatible with NumPy.
_round_ufunc = create_ufunc(
    'cupy_round',
    ('?q->e',
     'bq->b', 'Bq->B', 'hq->h', 'Hq->H', 'iq->i', 'Iq->I', 'lq->l', 'Lq->L',
     'qq->q', 'Qq->Q',
     ('eq->e', _round_float),
     ('fq->f', _round_float),
     ('dq->d', _round_float),
     ('Fq->F', _round_complex),
     ('Dq->D', _round_complex)),
    '''
    if (in1 >= 0) {
        out0 = in0;
    } else {
        // TODO(okuta): Move before loop
        long long x = pow10<long long>(-in1 - 1);
        // TODO(okuta): Check Numpy
        // `cupy.around(-123456789, -4)` works as follows:
        // (1) scale by `x` above: -123456.789
        // (2) split at the last 2 digits: -123400 + (-5.6789 * 10)
        // (3) round the latter by `rint()`: -123400 + (-6.0 * 10)
        // (4) unscale by `x` above: -123460000
        long long q = in0 / x / 100;
        int r = in0 - q*x*100;
        out0 = (q*100 + round_float(r/(x*10.0f))*10) * x;
    }''', preamble=_round_preamble)
# -----------------------------------------------------------------------------
# Array creation routines
# -----------------------------------------------------------------------------

cpdef _ndarray_base array(obj, dtype=None, bint copy=True, order='K',
                          bint subok=False, Py_ssize_t ndmin=0):
    """Core implementation of :func:`cupy.array`.

    Dispatch order:
      1. existing CuPy ndarrays,
      2. objects exposing ``__cuda_array_interface__``,
      3. objects exposing ``__cupy_get_ndarray__``,
      4. (nested) sequences of same-shape NumPy/CuPy arrays (fast
         concatenation path),
      5. everything else via :func:`numpy.array` on the host.
    """
    # TODO(beam2d): Support subok options
    if subok:
        raise NotImplementedError
    if order is None:
        order = 'K'
    if isinstance(obj, ndarray):
        return _array_from_cupy_ndarray(obj, dtype, copy, order, ndmin)
    if hasattr(obj, '__cuda_array_interface__'):
        return _array_from_cuda_array_interface(
            obj, dtype, copy, order, subok, ndmin)
    if hasattr(obj, '__cupy_get_ndarray__'):
        return _array_from_cupy_ndarray(
            obj.__cupy_get_ndarray__(), dtype, copy, order, ndmin)
    concat_shape, concat_type, concat_dtype = (
        _array_info_from_nested_sequence(obj))
    if concat_shape is not None:
        return _array_from_nested_sequence(
            obj, dtype, order, ndmin, concat_shape, concat_type, concat_dtype)
    return _array_default(obj, dtype, order, ndmin)
cdef _ndarray_base _array_from_cupy_ndarray(
        obj, dtype, bint copy, order, Py_ssize_t ndmin):
    """Convert an existing CuPy ndarray per dtype/copy/order/ndmin."""
    cdef Py_ssize_t cur_ndim
    cdef _ndarray_base result, src
    src = obj
    if dtype is None:
        dtype = src.dtype
    if src.data.device_id != device.get_device_id():
        # Cross-device input: materialize a copy on the current device
        # before casting.
        result = src.copy(order=order).astype(dtype, copy=False)
    else:
        result = src.astype(dtype, order=order, copy=copy)
    cur_ndim = result._shape.size()
    if ndmin > cur_ndim:
        if result is obj:
            # astype with copy=False may hand back `obj` itself; take a
            # view so the caller's array is not reshaped in place.
            result = result.view()
        result.shape = (1,) * (ndmin - cur_ndim) + result.shape
    return result
cdef _ndarray_base _array_from_cuda_array_interface(
        obj, dtype, bint copy, order, bint subok, Py_ssize_t ndmin):
    # Wrap the foreign device memory as a zero-copy ndarray first, then
    # run the regular `array` path on the wrapper.
    return array(
        _convert_object_with_cuda_array_interface(obj),
        dtype, copy, order, subok, ndmin)
cdef _ndarray_base _array_from_nested_sequence(
        obj, dtype, order, Py_ssize_t ndmin, concat_shape, concat_type,
        concat_dtype):
    """Build an ndarray from a nested sequence of same-shape arrays.

    `concat_*` come from _array_info_from_nested_sequence.
    """
    cdef Py_ssize_t ndim

    # resulting array is C order unless 'F' is explicitly specified
    # (i.e., it ignores order of element arrays in the sequence)
    if order is not None and len(order) >= 1 and order[0] in 'Ff':
        order = 'F'
    else:
        order = 'C'

    ndim = len(concat_shape)
    if ndmin > ndim:
        # Prepend length-1 axes to honor ndmin.
        concat_shape = (1,) * (ndmin - ndim) + concat_shape
    if dtype is None:
        # Device arrays are stored little-endian.
        dtype = concat_dtype.newbyteorder('<')

    if concat_type is numpy.ndarray:
        return _array_from_nested_numpy_sequence(
            obj, concat_dtype, dtype, concat_shape, order, ndmin)
    elif concat_type is ndarray:  # TODO(takagi) Consider subclases
        return _array_from_nested_cupy_sequence(
            obj, dtype, concat_shape, order)
    else:
        assert False
cdef _ndarray_base _array_from_nested_numpy_sequence(
        arrays, src_dtype, dst_dtype, const shape_t& shape, order,
        Py_ssize_t ndmin):
    """Concatenate a sequence of NumPy arrays on the host and upload it."""
    a_dtype = get_dtype(dst_dtype)  # convert to numpy.dtype
    if a_dtype.char not in '?bhilqBHILQefdFD':
        raise ValueError('Unsupported dtype %s' % a_dtype)
    cdef _ndarray_base a  # allocate it after pinned memory is secured
    cdef size_t itemcount = internal.prod(shape)
    cdef size_t nbytes = itemcount * a_dtype.itemsize
    stream = stream_module.get_current_stream()
    # Note: even if arrays are already backed by pinned memory, we still need
    # to allocate an extra buffer and copy from it to avoid potential data
    # race, see the discussion here:
    # https://github.com/cupy/cupy/pull/5155#discussion_r621808782
    cdef pinned_memory.PinnedMemoryPointer mem = (
        _alloc_async_transfer_buffer(nbytes))
    if mem is not None:
        # write concatenated arrays to the pinned memory directly
        src_cpu = (
            numpy.frombuffer(mem, a_dtype, itemcount)
            .reshape(shape, order=order))
        _concatenate_numpy_array(
            [numpy.expand_dims(e, 0) for e in arrays],
            0,
            get_dtype(src_dtype),
            a_dtype,
            src_cpu)
        a = ndarray(shape, dtype=a_dtype, order=order)
        a.data.copy_from_host_async(mem.ptr, nbytes)
        # Keep the pinned buffer alive until the async copy completes.
        pinned_memory._add_to_watch_list(stream.record(), mem)
    else:
        # fallback to numpy array and send it to GPU
        # Note: a_cpu.ndim is always >= 1
        a_cpu = numpy.array(arrays, dtype=a_dtype, copy=False, order=order,
                            ndmin=ndmin)
        a = ndarray(shape, dtype=a_dtype, order=order)
        a.data.copy_from_host(a_cpu.ctypes.data, nbytes)
    return a
cdef _ndarray_base _array_from_nested_cupy_sequence(obj, dtype, shape, order):
    """Concatenate nested CuPy arrays, then reshape/cast to the target."""
    elems = []
    for x in _flatten_list(obj):
        # 0-dim arrays cannot be concatenated; promote them to 1-dim first.
        elems.append(cupy.expand_dims(x, 0) if x.ndim == 0 else x)
    joined = _manipulation.concatenate_method(elems, 0)
    return joined.reshape(shape).astype(dtype, order=order, copy=False)
cdef _ndarray_base _array_default(obj, dtype, order, Py_ssize_t ndmin):
    """Fallback path of `array`: build on the host via numpy.array, upload."""
    # 'K'/'A' resolve to 'F' only for F-but-not-C-contiguous numpy inputs.
    if order is not None and len(order) >= 1 and order[0] in 'KAka':
        if isinstance(obj, numpy.ndarray) and obj.flags.fnc:
            order = 'F'
        else:
            order = 'C'
    a_cpu = numpy.array(obj, dtype=dtype, copy=False, order=order,
                        ndmin=ndmin)
    if a_cpu.dtype.char not in '?bhilqBHILQefdFD':
        raise ValueError('Unsupported dtype %s' % a_cpu.dtype)
    # Force a little-endian layout before uploading.
    a_cpu = a_cpu.astype(a_cpu.dtype.newbyteorder('<'), copy=False)
    a_dtype = a_cpu.dtype
    cdef shape_t a_shape = a_cpu.shape
    cdef _ndarray_base a = ndarray(a_shape, dtype=a_dtype, order=order)
    if a_cpu.ndim == 0:
        # 0-dim input: set the single value via fill() instead of a memcpy.
        a.fill(a_cpu)
        return a
    cdef Py_ssize_t nbytes = a.nbytes
    stream = stream_module.get_current_stream()
    # Note: even if obj is already backed by pinned memory, we still need to
    # allocate an extra buffer and copy from it to avoid potential data race,
    # see the discussion here:
    # https://github.com/cupy/cupy/pull/5155#discussion_r621808782
    cdef pinned_memory.PinnedMemoryPointer mem = (
        _alloc_async_transfer_buffer(nbytes))
    if mem is not None:
        # Stage into pinned memory for an asynchronous transfer; the buffer
        # is kept alive on a watch list until the copy event completes.
        src_cpu = numpy.frombuffer(mem, a_dtype, a_cpu.size)
        src_cpu[:] = a_cpu.ravel(order)
        a.data.copy_from_host_async(mem.ptr, nbytes)
        pinned_memory._add_to_watch_list(stream.record(), mem)
    else:
        # Synchronous fallback when no pinned memory is available.
        a.data.copy_from_host(a_cpu.ctypes.data, nbytes)
    return a
cdef tuple _array_info_from_nested_sequence(obj):
    """Inspect `obj` for the fast concatenation path of `array`.

    Returns ``(shape, type, dtype)`` when `obj` is a (nested) list/tuple
    whose leaves are all NumPy arrays or all CuPy arrays with a common
    shape; otherwise returns ``(None, None, None)``.
    """
    if not isinstance(obj, (list, tuple)):
        return None, None, None
    return _compute_concat_info_impl(obj)
cdef tuple _compute_concat_info_impl(obj):
    """Recursive worker for _array_info_from_nested_sequence."""
    cdef Py_ssize_t n_children
    no_info = (None, None, None)
    if isinstance(obj, (numpy.ndarray, ndarray)):
        # Leaf: an actual array.
        return obj.shape, type(obj), obj.dtype
    if hasattr(obj, '__cupy_get_ndarray__'):
        # Leaf: an object convertible to a CuPy ndarray.
        return obj.shape, ndarray, obj.dtype
    if not isinstance(obj, (list, tuple)):
        return no_info
    n_children = len(obj)
    if n_children == 0:
        return no_info
    shape0, type0, dtype0 = _compute_concat_info_impl(obj[0])
    if shape0 is None:
        return no_info
    for child in obj[1:]:
        shape1, type1, dtype1 = _compute_concat_info_impl(child)
        # Every child must match the first in shape and array type.
        if shape1 is None or shape1 != shape0 or type1 is not type0:
            return no_info
        if dtype1 != dtype0:
            # Dtypes may differ; accumulate a common promoted dtype.
            dtype0 = numpy.promote_types(dtype0, dtype1)
    return (n_children,) + shape0, type0, dtype0
cdef list _flatten_list(object obj):
    """Recursively flatten nested lists/tuples into a flat list of leaves."""
    if not isinstance(obj, (list, tuple)):
        return [obj]
    flat = []
    for item in obj:
        flat.extend(_flatten_list(item))
    return flat
# Whether numpy.concatenate accepts an `out` argument (added in NumPy 1.14).
cdef bint _numpy_concatenate_has_out_argument = (
    numpy.lib.NumpyVersion(numpy.__version__) >= '1.14.0')


cdef inline _concatenate_numpy_array(arrays, axis, src_dtype, dst_dtype, out):
    # Concatenate host arrays directly into `out` when possible.
    # type(*_dtype) must be numpy.dtype
    if (_numpy_concatenate_has_out_argument
            and src_dtype.kind == dst_dtype.kind):
        # concatenate only accepts same_kind casting
        numpy.concatenate(arrays, axis, out)
    else:
        # Different kinds (or old NumPy): concatenate first, then cast on
        # assignment into `out`.
        out[:] = numpy.concatenate(arrays, axis)
cdef inline _alloc_async_transfer_buffer(Py_ssize_t nbytes):
    # Try to allocate a pinned (page-locked) host buffer for asynchronous
    # host-to-device transfers.  Returns None (after warning) when the
    # allocation fails so callers can fall back to a synchronous copy;
    # any error other than memory exhaustion is re-raised.
    try:
        return pinned_memory.alloc_pinned_memory(nbytes)
    except CUDARuntimeError as e:
        if e.status != runtime.errorMemoryAllocation:
            raise
        warnings.warn(
            'Using synchronous transfer as pinned memory ({} bytes) '
            'could not be allocated. '
            'This generally occurs because of insufficient host memory. '
            'The original error was: {}'.format(nbytes, e),
            _util.PerformanceWarning)
        return None
cpdef _ndarray_base _internal_ascontiguousarray(_ndarray_base a):
    """Return `a` if it is C-contiguous, else a C-contiguous copy."""
    if a._c_contiguous:
        return a
    out = _ndarray_init(ndarray, a._shape, a.dtype, None)
    elementwise_copy(a, out)
    return out
cpdef _ndarray_base _internal_asfortranarray(_ndarray_base a):
    """Return `a` if it is F-contiguous, else an F-contiguous copy.

    For 2-D float32/float64 C-contiguous inputs, the transpose is done by
    cuBLAS ``geam``; otherwise an elementwise copy into an F-ordered array
    is used.
    """
    cdef _ndarray_base newarray
    cdef int m, n
    cdef intptr_t handle
    if a._f_contiguous:
        return a
    newarray = ndarray(a.shape, a.dtype, order='F')
    if (a._c_contiguous and a._shape.size() == 2 and
            (a.dtype == numpy.float32 or a.dtype == numpy.float64)):
        m, n = a.shape
        handle = device.get_cublas_handle()
        # geam scalar coefficients alpha=1, beta=0 (host pointers).
        one = numpy.array(1, dtype=a.dtype)
        zero = numpy.array(0, dtype=a.dtype)
        if a.dtype == numpy.float32:
            cublas.sgeam(
                handle,
                1,  # transpose a
                1,  # transpose newarray
                m, n, one.ctypes.data, a.data.ptr, n,
                zero.ctypes.data, a.data.ptr, n, newarray.data.ptr, m)
        elif a.dtype == numpy.float64:
            cublas.dgeam(
                handle,
                1,  # transpose a
                1,  # transpose newarray
                m, n, one.ctypes.data, a.data.ptr, n,
                zero.ctypes.data, a.data.ptr, n, newarray.data.ptr, m)
    else:
        elementwise_copy(a, newarray)
    return newarray
cpdef _ndarray_base ascontiguousarray(_ndarray_base a, dtype=None):
    """Return a C-contiguous array (at least 1-D) with the given dtype."""
    cdef bint same_dtype
    cdef bint zero_dim = a._shape.size() == 0
    if dtype is None:
        dtype = a.dtype
        same_dtype = True
    else:
        dtype = get_dtype(dtype)
        same_dtype = dtype == a.dtype
    if same_dtype and a._c_contiguous:
        # Already contiguous; 0-dim inputs are still promoted to 1-dim.
        return _manipulation._ndarray_ravel(a, 'C') if zero_dim else a
    out = ndarray((1,) if zero_dim else a.shape, dtype)
    elementwise_copy(a, out)
    return out
cpdef _ndarray_base asfortranarray(_ndarray_base a, dtype=None):
    """Return an F-contiguous array (at least 1-D) with the given dtype."""
    cdef _ndarray_base out
    cdef bint same_dtype
    cdef bint zero_dim = a._shape.size() == 0
    if dtype is None:
        dtype = a.dtype
        same_dtype = True
    else:
        dtype = get_dtype(dtype)
        same_dtype = dtype == a.dtype
    if same_dtype and a._f_contiguous:
        # Already F-contiguous; 0-dim inputs are still promoted to 1-dim.
        return _manipulation._ndarray_ravel(a, 'F') if zero_dim else a
    if same_dtype and not zero_dim:
        # Same dtype: the internal helper may use cuBLAS for 2-D floats.
        return _internal_asfortranarray(a)
    out = ndarray((1,) if zero_dim else a.shape, dtype, order='F')
    elementwise_copy(a, out)
    return out
cpdef _ndarray_base _convert_object_with_cuda_array_interface(a):
    """Wrap an object exposing ``__cuda_array_interface__`` as an ndarray.

    Zero-copy: the result refers to `a`'s device memory through
    UnownedMemory, which keeps `a` alive as the owner.
    """
    if runtime._is_hip_environment:
        raise RuntimeError(
            'HIP/ROCm does not support cuda array interface')
    cdef Py_ssize_t sh, st
    cdef dict desc = a.__cuda_array_interface__
    cdef tuple shape = desc['shape']
    cdef int dev_id = -1  # -1 lets UnownedMemory resolve the device itself
    cdef size_t nbytes
    ptr = desc['data'][0]
    dtype = numpy.dtype(desc['typestr'])
    if dtype.byteorder == '>':
        raise ValueError('CuPy does not support the big-endian byte-order')
    mask = desc.get('mask')
    if mask is not None:
        raise ValueError('CuPy currently does not support masked arrays.')
    strides = desc.get('strides')
    if strides is not None:
        # Strided layout: the region must span the farthest-reaching axis.
        nbytes = 0
        for sh, st in zip(shape, strides):
            nbytes = max(nbytes, abs(sh * st))
    else:
        # strides omitted/None means C-contiguous.
        nbytes = internal.prod_sequence(shape) * dtype.itemsize
    # the v2 protocol sets ptr=0 for 0-size arrays, so we can't look up
    # the pointer attributes and must use the current device
    if nbytes == 0:
        dev_id = device.get_device_id()
    mem = memory_module.UnownedMemory(ptr, nbytes, a, dev_id)
    memptr = memory.MemoryPointer(mem, 0)
    # the v3 protocol requires an immediate synchronization, unless
    # 1. the stream is not set (ex: from v0 ~ v2) or is None
    # 2. users explicitly overwrite this requirement
    stream_ptr = desc.get('stream')
    if stream_ptr is not None:
        if _util.CUDA_ARRAY_INTERFACE_SYNC:
            runtime.streamSynchronize(stream_ptr)
    return ndarray(shape, dtype, memptr, strides)
cdef _ndarray_base _ndarray_init(subtype, const shape_t& shape, dtype, obj):
    # Fast-path internal constructor allocating a fresh C-contiguous array.
    # Use `_no_init=True` for fast init. Now calling `__array_finalize__` is
    # responsibility of this function.
    cdef _ndarray_base ret = ndarray.__new__(subtype, _obj=obj, _no_init=True)
    ret._init_fast(shape, dtype, True)
    if subtype is not ndarray:
        ret.__array_finalize__(obj)
    return ret
cdef _ndarray_base _create_ndarray_from_shape_strides(
        subtype, const shape_t& shape, const strides_t& strides, dtype, obj):
    """Allocate a buffer just large enough for the given shape/strides.

    Negative strides are supported: the allocation spans from the lowest
    to the highest addressed byte, and the returned array's data pointer
    is offset so that index 0 maps into the buffer correctly.
    """
    cdef int ndim = shape.size()
    # `begin` accumulates the most negative byte offset reachable (<= 0);
    # `end` the one-past-last positive extent (>= itemsize).
    cdef int64_t begin = 0, end = dtype.itemsize
    cdef memory.MemoryPointer ptr
    for i in range(ndim):
        if strides[i] > 0:
            end += strides[i] * (shape[i] - 1)
        elif strides[i] < 0:
            begin += strides[i] * (shape[i] - 1)
    # Bug fix: the element at index 0 must sit `-begin` bytes *into* the
    # allocation (begin <= 0).  The previous `+ begin` moved the data
    # pointer before the start of the buffer whenever any stride was
    # negative, producing out-of-bounds accesses.
    ptr = memory.alloc(end - begin) - begin
    return ndarray.__new__(
        subtype, shape, dtype, _obj=obj, memptr=ptr, strides=strides)
cpdef min_scalar_type(a):
    """
    For scalar ``a``, returns the data type with the smallest size
    and smallest scalar kind which can hold its value. For non-scalar
    array ``a``, returns the vector's dtype unmodified.

    .. seealso:: :func:`numpy.min_scalar_type`
    """
    if isinstance(a, ndarray):
        return a.dtype
    # Nested sequences of arrays report their (promoted) common dtype.
    _, concat_type, concat_dtype = _array_info_from_nested_sequence(a)
    if concat_type is None:
        return numpy.min_scalar_type(a)
    return concat_dtype
from cupy._core.core cimport _ndarray_base

# NOTE(review): the declarations below read like `.pxd` content that was
# concatenated into this file by the import -- confirm against the
# original repository layout.

# DLPack device-type constants needed at the Cython level.
cdef extern from './include/cupy/dlpack/dlpack.h' nogil:
    int device_CUDA 'kDLCUDA'
    int managed_CUDA 'kDLCUDAManaged'
    int device_ROCM 'kDLROCM'

cpdef object toDlpack(_ndarray_base array) except +
cpdef _ndarray_base fromDlpack(object dltensor) except +
cpdef from_dlpack(array)
cimport cpython # NOQA
from libc cimport stdlib
from libc.stdint cimport uint8_t
from libc.stdint cimport uint16_t
from libc.stdint cimport int32_t
from libc.stdint cimport int64_t
from libc.stdint cimport uint64_t
from libc.stdint cimport intptr_t
from libcpp.vector cimport vector
from cupy_backends.cuda.api cimport runtime
from cupy_backends.cuda cimport stream as stream_module
from cupy._core.core cimport _ndarray_base
from cupy.cuda cimport memory
import warnings
import cupy
import cupy._core.core as core
# Declarations mirrored from the bundled dlpack.h (DLPack C ABI).
cdef extern from './include/cupy/dlpack/dlpack.h' nogil:
    cdef int DLPACK_VERSION

    # Device kinds a DLPack tensor can live on.
    cdef enum DLDeviceType:
        kDLCPU
        kDLCUDA
        kDLCUDAHost
        kDLOpenCL
        kDLVulkan
        kDLMetal
        kDLVPI
        kDLROCM
        kDLROCMHost
        kDLExtDev
        kDLCUDAManaged
        kDLOneAPI
        kDLWebGPU
        kDLHexagon

    ctypedef struct DLDevice:
        DLDeviceType device_type
        int32_t device_id

    # Scalar type categories; a concrete dtype is (code, bits, lanes).
    cdef enum DLDataTypeCode:
        kDLInt
        kDLUInt
        kDLFloat
        kDLBfloat
        kDLComplex
        kDLBool

    ctypedef struct DLDataType:
        uint8_t code
        uint8_t bits
        uint16_t lanes

    ctypedef struct DLTensor:
        void* data
        DLDevice device
        int32_t ndim
        DLDataType dtype
        int64_t* shape
        int64_t* strides
        uint64_t byte_offset

    # A DLTensor plus ownership info; `deleter` releases manager_ctx.
    ctypedef struct DLManagedTensor:
        DLTensor dl_tensor
        void* manager_ctx
        void (*deleter)(DLManagedTensor*)  # noqa: E211
def get_build_version():
    """Return the DLPACK_VERSION constant of the bundled dlpack.h, as str."""
    return str(DLPACK_VERSION)
cdef void pycapsule_deleter(object dltensor):
    # Capsule destructor: frees the DLManagedTensor only if no consumer
    # ever took ownership of the capsule.
    cdef DLManagedTensor* dlm_tensor
    # Do not invoke the deleter on a used capsule
    # (per the DLPack protocol a consumer renames a consumed capsule, so
    #  PyCapsule_IsValid with the name 'dltensor' then fails).
    if cpython.PyCapsule_IsValid(dltensor, 'dltensor'):
        dlm_tensor = <DLManagedTensor*>cpython.PyCapsule_GetPointer(
            dltensor, 'dltensor')
        dlm_tensor.deleter(dlm_tensor)
cdef void deleter(DLManagedTensor* tensor) with gil:
    # DLManagedTensor.deleter installed by toDlpack() below.
    # Idempotent: a NULL manager_ctx marks an already-freed tensor.
    if tensor.manager_ctx is NULL:
        return
    # shape and strides share one allocation (see toDlpack).
    stdlib.free(tensor.dl_tensor.shape)
    # Release the reference toDlpack() took on the source ndarray.
    cpython.Py_DECREF(<_ndarray_base>tensor.manager_ctx)
    tensor.manager_ctx = NULL
    stdlib.free(tensor)
# The name of this function is following the framework integration guide of
# TensorComprehensions.
cpdef object toDlpack(_ndarray_base array) except +:
    """Build a 'dltensor' capsule sharing `array`'s memory (zero copy).

    A reference to `array` is stashed in ``manager_ctx`` and released by
    ``deleter``; shape and strides live in a single malloc'd block.
    """
    cdef DLManagedTensor* dlm_tensor = \
        <DLManagedTensor*>stdlib.malloc(sizeof(DLManagedTensor))
    cdef size_t ndim = array._shape.size()
    cdef DLTensor* dl_tensor = &dlm_tensor.dl_tensor
    cdef intptr_t data_ptr = array.data.ptr
    dl_tensor.data = <void*>data_ptr
    dl_tensor.ndim = ndim
    # One allocation holds the shape (first half) and strides (second half).
    cdef int64_t* shape_strides = \
        <int64_t*>stdlib.malloc(ndim * sizeof(int64_t) * 2)
    for n in range(ndim):
        shape_strides[n] = array._shape[n]
    dl_tensor.shape = shape_strides
    for n in range(ndim):
        # DLPack expresses strides in element counts, not bytes.
        shape_strides[n + ndim] = array._strides[n] // array.dtype.itemsize
    dl_tensor.strides = shape_strides + ndim
    dl_tensor.byte_offset = 0
    cdef DLDevice* device = &dl_tensor.device
    cdef bint is_managed
    cdef int dev_id = array.data.device_id
    if not runtime._is_hip_environment:
        # Managed (unified) memory gets its own DLPack device type.
        attrs = runtime.pointerGetAttributes(data_ptr)
        is_managed = (attrs.type == runtime.memoryTypeManaged)
        if is_managed:
            device.device_type = kDLCUDAManaged
            dev_id = 0  # make it accessible on CPU too
        else:
            device.device_type = kDLCUDA
    else:
        device.device_type = kDLROCM
    device.device_id = dev_id
    # Translate the numpy dtype kind into a (code, bits, lanes) triple.
    cdef DLDataType* dtype = &dl_tensor.dtype
    if array.dtype.kind == 'u':
        dtype.code = <uint8_t>kDLUInt
    elif array.dtype.kind == 'i':
        dtype.code = <uint8_t>kDLInt
    elif array.dtype.kind == 'f':
        dtype.code = <uint8_t>kDLFloat
    elif array.dtype.kind == 'c':
        dtype.code = <uint8_t>kDLComplex
    elif array.dtype.kind == 'b':
        dtype.code = <uint8_t>kDLBool
    else:
        raise ValueError('Unknown dtype')
    dtype.lanes = <uint16_t>1
    dtype.bits = <uint8_t>(array.dtype.itemsize * 8)
    dlm_tensor.manager_ctx = <void*>array
    # Keep the source array alive until the consumer runs the deleter.
    cpython.Py_INCREF(array)
    dlm_tensor.deleter = deleter
    return cpython.PyCapsule_New(dlm_tensor, 'dltensor', pycapsule_deleter)
# TODO(leofang): Support kDLCUDAPinned and kDLROCMPinned
cdef class DLPackMemory(memory.BaseMemory):

    """Memory object for a dlpack tensor.

    This does not allocate any memory; it takes ownership of the consumed
    DLPack capsule and frees it through the producer's deleter.

    """

    cdef DLManagedTensor* dlm_tensor
    cdef object dltensor

    def __init__(self, object dltensor):
        cdef DLManagedTensor* dlm_tensor
        # sanity checks
        if not cpython.PyCapsule_IsValid(dltensor, 'dltensor'):
            raise ValueError('A DLPack tensor object cannot be consumed '
                             'multiple times')
        dlm_tensor = <DLManagedTensor*>cpython.PyCapsule_GetPointer(
            dltensor, 'dltensor')
        # The producer's backend must match the one CuPy was built for.
        if runtime._is_hip_environment:
            if dlm_tensor.dl_tensor.device.device_type != kDLROCM:
                raise RuntimeError('CuPy is built against ROCm/HIP, different '
                                   'from the backend that backs the incoming '
                                   'DLPack tensor')
        else:
            if dlm_tensor.dl_tensor.device.device_type not in (
                    kDLCUDA, kDLCUDAManaged):
                raise RuntimeError('CuPy is built against CUDA, different '
                                   'from the backend that backs the incoming '
                                   'DLPack tensor')
        self.dltensor = dltensor
        self.dlm_tensor = dlm_tensor
        self.ptr = <intptr_t>dlm_tensor.dl_tensor.data
        if dlm_tensor.dl_tensor.device.device_type == kDLCUDAManaged:
            # look up the actual physical device as the id from
            # dl_tensor could be 0
            attrs = runtime.pointerGetAttributes(self.ptr)
            self.device_id = attrs.device
        else:
            self.device_id = dlm_tensor.dl_tensor.device.device_id
        # Bug fix: the element count is the *product* of the extents; the
        # previous code summed them (`n += s`), under-reporting size for
        # any multi-dimensional tensor and reporting 0 bytes for 0-dim
        # tensors.  int64 accumulators avoid overflowing a 32-bit int on
        # large tensors.
        cdef int64_t numel = 1, extent = 0
        cdef int ndim = dlm_tensor.dl_tensor.ndim
        cdef int64_t* shape = dlm_tensor.dl_tensor.shape
        for extent in shape[:ndim]:
            numel *= extent
        self.size = dlm_tensor.dl_tensor.dtype.bits * numel // 8

    def __dealloc__(self):
        cdef DLManagedTensor* dlm_tensor = self.dlm_tensor
        # dlm_tensor could be uninitialized if an error is raised in __init__
        if dlm_tensor != NULL:
            dlm_tensor.deleter(dlm_tensor)
# The name of this function is following the framework integration guide of
# TensorComprehensions.
cpdef _ndarray_base fromDlpack(object dltensor) except +:
    """Zero-copy conversion from a DLPack tensor to a :class:`~cupy.ndarray`.

    DLPack is a open in memory tensor structure proposed in this repository:
    `dmlc/dlpack <https://github.com/dmlc/dlpack>`_.

    This function takes a :class:`PyCapsule` object which contains a pointer to
    a DLPack tensor as input, and returns a :class:`~cupy.ndarray`. This
    function does not copy the data in the DLPack tensor but both
    DLPack tensor and :class:`~cupy.ndarray` have pointers which are pointing
    to the same memory region for the data.

    Args:
        dltensor (:class:`PyCapsule`): Input DLPack tensor which is
            encapsulated in a :class:`PyCapsule` object.

    Returns:
        array (:class:`~cupy.ndarray`): A CuPy ndarray.

    .. warning::

        This function is deprecated in favor of :func:`~cupy.from_dlpack` and
        will be removed in a future version of CuPy.

    .. warning::

        As of the DLPack v0.5 specification, it is implicitly assumed that
        the user is responsible to ensure the Producer and the Consumer are
        operating on the same stream.

    .. seealso::

        :meth:`cupy.ndarray.toDlpack` is a method for zero-copy conversion
        from a :class:`~cupy.ndarray` to a DLPack tensor (which is encapsulated
        in a :class:`PyCapsule` object).

    .. admonition:: Example

        >>> import cupy
        >>> array1 = cupy.array([0, 1, 2], dtype=cupy.float32)
        >>> dltensor = array1.toDlpack()
        >>> array2 = cupy.fromDlpack(dltensor)
        >>> cupy.testing.assert_array_equal(array1, array2)

    """
    warnings.warn('This function is deprecated in favor of cupy.from_dlpack',
                  DeprecationWarning)
    return _dlpack_to_cupy_array(dltensor)
cdef inline _ndarray_base _dlpack_to_cupy_array(dltensor) except +:
    # Wrap the DLPack tensor held by the ``dltensor`` capsule as a
    # cupy.ndarray without copying the underlying data.
    cdef DLPackMemory mem = DLPackMemory(dltensor)
    cdef DLDataType dtype = mem.dlm_tensor.dl_tensor.dtype
    cdef int bits = dtype.bits
    if dtype.lanes != 1:
        raise ValueError(f'vector dtypes (lanes={dtype.lanes}) is '
                         'not supported')

    # Translate the (type-code, bit-width) pair into a CuPy dtype.
    if dtype.code == kDLUInt:
        cp_dtype = {8: cupy.uint8, 16: cupy.uint16,
                    32: cupy.uint32, 64: cupy.uint64}.get(bits)
        if cp_dtype is None:
            raise TypeError('uint{} is not supported.'.format(bits))
    elif dtype.code == kDLInt:
        cp_dtype = {8: cupy.int8, 16: cupy.int16,
                    32: cupy.int32, 64: cupy.int64}.get(bits)
        if cp_dtype is None:
            raise TypeError('int{} is not supported.'.format(bits))
    elif dtype.code == kDLFloat:
        cp_dtype = {16: cupy.float16, 32: cupy.float32,
                    64: cupy.float64}.get(bits)
        if cp_dtype is None:
            raise TypeError('float{} is not supported.'.format(bits))
    elif dtype.code == kDLComplex:
        # TODO(leofang): support complex32
        cp_dtype = {64: cupy.complex64, 128: cupy.complex128}.get(bits)
        if cp_dtype is None:
            raise TypeError('complex{} is not supported.'.format(bits))
    elif dtype.code == kDLBool:
        if bits != 8:
            raise TypeError(f'{bits}-bit bool is not supported')
        cp_dtype = cupy.bool_
    elif dtype.code == kDLBfloat:
        raise NotImplementedError('CuPy does not support bfloat16 yet')
    else:
        raise TypeError('Unsupported dtype. dtype code: {}'.format(dtype.code))

    mem_ptr = memory.MemoryPointer(mem, mem.dlm_tensor.dl_tensor.byte_offset)
    cdef int64_t ndim = mem.dlm_tensor.dl_tensor.ndim
    cdef int64_t* shape = mem.dlm_tensor.dl_tensor.shape
    cdef vector[Py_ssize_t] shape_vec
    shape_vec.assign(shape, shape + ndim)

    cdef int64_t* strides = mem.dlm_tensor.dl_tensor.strides
    cdef vector[Py_ssize_t] strides_vec
    if strides is NULL:
        # NULL strides means a compact row-major layout; let CuPy derive
        # the strides itself.
        # Make sure this capsule will never be used again.
        cpython.PyCapsule_SetName(mem.dltensor, 'used_dltensor')
        return core.ndarray(shape_vec, cp_dtype, mem_ptr, strides=None)

    # DLPack strides are expressed in elements; CuPy expects bytes.
    for i in range(ndim):
        strides_vec.push_back(strides[i] * (bits // 8))
    # Make sure this capsule will never be used again.
    cpython.PyCapsule_SetName(mem.dltensor, 'used_dltensor')
    return core.ndarray(shape_vec, cp_dtype, mem_ptr, strides=strides_vec)
cpdef from_dlpack(array):
    """Zero-copy conversion between array objects compliant with the DLPack
    data exchange protocol.

    Args:
        array (object): an array object that implements two methods:
            ``__dlpack__()`` and ``__dlpack_device__()``.

    Returns:
        cupy.ndarray: a CuPy array that can be safely accessed on CuPy's
        current stream.

    .. note::
        This function is different from CuPy's legacy :func:`~cupy.fromDlpack`
        function. This function takes any object implementing the DLPack data
        exchange protocol, as well as a raw :class:`PyCapsule` object that
        contains the DLPack tensor as input (for backward compatibility),
        whereas :func:`~cupy.fromDlpack` only accepts :class:`PyCapsule`
        objects. If the input object is not compliant with the protocol, users
        are responsible to ensure data safety.

    .. seealso::
        :func:`numpy.from_dlpack`,
        `Python Specification for DLPack`_,
        `Data interchange mechanisms`_

    .. _Python Specification for DLPack:
        https://dmlc.github.io/dlpack/latest/python_spec.html
    .. _Data interchange mechanisms:
        https://data-apis.org/array-api/latest/design_topics/data_interchange.html
    """
    if not hasattr(array, '__dlpack_device__'):
        # backward compatibility: accept passing in a pycapsule
        return _dlpack_to_cupy_array(array)

    dev_type, dev_id = array.__dlpack_device__()
    # CuPy is the consumer, so we provide our current stream to the producer
    # so that it can order its pending work before we touch the data.
    if dev_type == <int>kDLCUDA or dev_type == <int>kDLCUDAManaged:
        prev_device = cupy.cuda.runtime.getDevice()
        try:
            cupy.cuda.runtime.setDevice(dev_id)
            # this branch is CUDA-only; HIP devices report kDLROCM
            assert not runtime._is_hip_environment
            stream = stream_module.get_current_stream_ptr()
            if stream == 0:
                # a raw 0 is ambiguous in the DLPack protocol; pass the
                # explicit default-stream pointer instead
                stream = stream_module.get_default_stream_ptr()
            dltensor = array.__dlpack__(stream=stream)
        finally:
            # always restore the caller's active device
            cupy.cuda.runtime.setDevice(prev_device)
    elif dev_type == <int>kDLROCM:
        prev_device = cupy.cuda.runtime.getDevice()
        try:
            cupy.cuda.runtime.setDevice(dev_id)
            assert runtime._is_hip_environment
            stream = stream_module.get_current_stream_ptr()
            dltensor = array.__dlpack__(stream=stream)
        finally:
            cupy.cuda.runtime.setDevice(prev_device)
    elif dev_type == <int>kDLCPU:
        raise TypeError(
            'CPU arrays cannot be directly imported to CuPy. '
            'Use `cupy.array(numpy.from_dlpack(input))` instead.')
    else:
        # TODO(leofang): support kDLCUDAPinned etc
        raise TypeError(f'Unsupported array type: {dev_type}')
    return _dlpack_to_cupy_array(dltensor)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment