import numpy
from cupy._core._dtype import get_dtype
import cupy
from cupy._core import _fusion_thread_local
from cupy._core import core
from cupy._core._scalar import get_typename
_thread_local = _fusion_thread_local.thread_local
_dtype_to_astype_dict = None
def _set_dtype_to_astype_dict():
"""Set a dict with dtypes and astype ufuncs to `_dtype_to_astype_dict`.
Creates a ufunc for type cast operations, and set a dict with keys
as the dtype of the output array and values as astype ufuncs.
This function is called at most once.
"""
global _dtype_to_astype_dict
_dtype_to_astype_dict = {}
dtype_list = [numpy.dtype(type_char) for type_char in '?bhilqBHILQefdFD']
for t in dtype_list:
name = 'astype_{}'.format(t)
rules = tuple(['{}->{}'.format(s.char, t.char) for s in dtype_list])
command = 'out0 = static_cast< {} >(in0)'.format(get_typename(t))
_dtype_to_astype_dict[t] = core.create_ufunc(name, rules, command)
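# Hedged usage sketch (illustrative, not part of the original module): once
# `_set_dtype_to_astype_dict()` has run, `_dtype_to_astype_dict` maps each
# output dtype to a cast ufunc built above, e.g.:
#
#     _set_dtype_to_astype_dict()
#     astype_f4 = _dtype_to_astype_dict[numpy.dtype('float32')]
#     # `astype_f4` is a cupy ufunc named 'astype_float32' whose routine is
#     # `out0 = static_cast<float>(in0)` for each supported input dtype.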
class _VariableProxy:
"""Abstracted array/scalar object passed to the target function.
"""
def __init__(self, content):
assert isinstance(content, cupy._core._fusion_variable._TraceVariable)
self.content = content
def __neg__(self):
return cupy.negative(self)
def __add__(self, other):
return cupy.add(self, other)
def __radd__(self, other):
return cupy.add(other, self)
def __sub__(self, other):
return cupy.subtract(self, other)
def __rsub__(self, other):
return cupy.subtract(other, self)
def __mul__(self, other):
return cupy.multiply(self, other)
def __rmul__(self, other):
return cupy.multiply(other, self)
def __div__(self, other):
return cupy.divide(self, other)
def __rdiv__(self, other):
return cupy.divide(other, self)
def __truediv__(self, other):
return cupy.true_divide(self, other)
def __rtruediv__(self, other):
return cupy.true_divide(other, self)
def __floordiv__(self, other):
return cupy.floor_divide(self, other)
def __rfloordiv__(self, other):
return cupy.floor_divide(other, self)
def __mod__(self, other):
return cupy.remainder(self, other)
def __rmod__(self, other):
return cupy.remainder(other, self)
def __pow__(self, other):
return cupy.power(self, other)
def __lshift__(self, other):
return cupy.left_shift(self, other)
def __rlshift__(self, other):
return cupy.left_shift(other, self)
def __rshift__(self, other):
return cupy.right_shift(self, other)
def __rrshift__(self, other):
return cupy.right_shift(other, self)
def __invert__(self):
return cupy.invert(self)
def __and__(self, other):
return cupy.bitwise_and(self, other)
def __rand__(self, other):
return cupy.bitwise_and(other, self)
def __or__(self, other):
return cupy.bitwise_or(self, other)
def __ror__(self, other):
return cupy.bitwise_or(other, self)
def __xor__(self, other):
return cupy.bitwise_xor(self, other)
def __rxor__(self, other):
return cupy.bitwise_xor(other, self)
def __lt__(self, other):
return cupy.less(self, other)
def __le__(self, other):
return cupy.less_equal(self, other)
def __eq__(self, other):
return cupy.equal(self, other)
def __ne__(self, other):
return cupy.not_equal(self, other)
def __ge__(self, other):
return cupy.greater_equal(self, other)
def __gt__(self, other):
return cupy.greater(self, other)
def copy(self):
return cupy.copy(self)
def astype(self, dtype, order=None, casting=None, subok=None, copy=True):
dtype = get_dtype(dtype)
if order is not None:
raise TypeError('order is not supported yet')
if casting is not None:
raise TypeError('casting is not supported yet')
if subok is not None:
raise TypeError('subok is not supported yet')
if not copy and self.dtype == dtype:
return self
if _dtype_to_astype_dict is None:
_set_dtype_to_astype_dict()
return _dtype_to_astype_dict[dtype](self)
def sum(self, axis=None, dtype=None, out=None, keepdims=False):
return cupy.sum(
self, axis=axis, dtype=dtype, out=out, keepdims=keepdims)
def prod(self, axis=None, dtype=None, out=None, keepdims=False):
return cupy.prod(
self, axis=axis, dtype=dtype, out=out, keepdims=keepdims)
def max(self, axis=None, out=None, keepdims=False):
return cupy.max(self, axis=axis, out=out, keepdims=keepdims)
def min(self, axis=None, out=None, keepdims=False):
return cupy.min(self, axis=axis, out=out, keepdims=keepdims)
def all(self, axis=None, out=None, keepdims=False):
return cupy.all(self, axis=axis, out=out, keepdims=keepdims)
def any(self, axis=None, out=None, keepdims=False):
return cupy.any(self, axis=axis, out=out, keepdims=keepdims)
@property
def dtype(self):
return self.content.dtype
@property
def ndim(self):
return self.content.ndim
@property
def shape(self):
        raise NotImplementedError('`shape` is not supported currently.')
class _ScalarProxy(_VariableProxy):
"""An abstracted scalar object passed to the target function.
Attributes:
dtype(dtype): The dtype of the array.
imag(_ArrayProxy): The imaginary part of the array (Not implemented)
real(_ArrayProxy): The real part of the array (Not implemented)
ndim(int): The number of dimensions of the array.
"""
def __repr__(self):
return '_ScalarProxy({}, dtype={})'.format(
self._emit_param_name(), self.dtype)
class _ArrayProxy(_VariableProxy):
"""An abstracted array object passed to the target function.
Attributes:
dtype(dtype): The dtype of the array.
imag(_ArrayProxy): The imaginary part of the array (Not implemented)
real(_ArrayProxy): The real part of the array (Not implemented)
ndim(int): The number of dimensions of the array.
"""
def __repr__(self):
return '_ArrayProxy([...], dtype=\'{}\', ndim={})'.format(
self.dtype.char, self.ndim)
def _inplace_op(self, ufunc, other):
return ufunc(self, other, self)
def __iadd__(self, other):
return self._inplace_op(cupy.add, other)
def __isub__(self, other):
return self._inplace_op(cupy.subtract, other)
def __imul__(self, other):
return self._inplace_op(cupy.multiply, other)
def __idiv__(self, other):
return self._inplace_op(cupy.divide, other)
def __itruediv__(self, other):
return self._inplace_op(cupy.true_divide, other)
def __ifloordiv__(self, other):
return self._inplace_op(cupy.floor_divide, other)
def __imod__(self, other):
return self._inplace_op(cupy.remainder, other)
def __ipow__(self, other):
return self._inplace_op(cupy.power, other)
def __ilshift__(self, other):
return self._inplace_op(cupy.left_shift, other)
def __irshift__(self, other):
return self._inplace_op(cupy.right_shift, other)
def __iand__(self, other):
return self._inplace_op(cupy.bitwise_and, other)
def __ior__(self, other):
return self._inplace_op(cupy.bitwise_or, other)
def __ixor__(self, other):
return self._inplace_op(cupy.bitwise_xor, other)
def __getitem__(self, index):
return _fusion_thread_local.call_indexing(self, index)
def __setitem__(self, slices, value):
if slices is Ellipsis or (
isinstance(slices, slice) and slices == slice(None)):
_fusion_thread_local.call_ufunc(
core.elementwise_copy, value, out=self)
else:
            raise ValueError('Fusion supports only `[...]` or `[:]`.')
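# Hedged sketch of how these proxies are exercised: inside a function traced
# by the new fusion mechanism, arguments arrive as _ArrayProxy /
# _ScalarProxy instances, so ordinary operators dispatch to cupy ufuncs via
# the methods defined above. Names below are illustrative:
#
#     @cupy.fuse()
#     def saxpy(a, x, y):
#         return a * x + y   # traced as cupy.multiply and cupy.add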
import itertools
import string
from libcpp cimport vector
from cupy._core cimport _carray
from cupy._core.core cimport _ndarray_init
from cupy._core.core cimport compile_with_cache
from cupy._core.core cimport _ndarray_base
from cupy._core cimport internal
from cupy._core cimport _routines_manipulation as _manipulation
from cupy_backends.cuda.api cimport driver
from cupy_backends.cuda.api cimport runtime
import cupy as _cupy
from cupy._core import _dtype
from cupy import _util
from cupy._core import _codeblock
from cupy._core import _fusion_op
from cupy._core._fusion_variable import _TraceVariable
from cupy._core._fusion_variable import _TraceScalar
from cupy._core._fusion_variable import _TraceArray
cdef Py_ssize_t _default_block_size = (
256 if runtime._is_hip_environment else 512)
@_util.memoize(for_each_device=True)
def _cuda_compile(preamble, name, cuda_params, cuda_body, use_grid_sync):
template = (
'${preamble}\n\n'
'extern "C" __global__ void ${name}(${cuda_params}) ${cuda_body}\n'
)
if use_grid_sync:
template = '#include <cooperative_groups.h>\n\n' + template
code = string.Template(template).substitute(
preamble=preamble,
name=name,
cuda_params=cuda_params,
cuda_body=cuda_body)
    # (For contributors) We can view the whole generated CUDA code
    # by uncommenting the following line.
# print(code)
module = compile_with_cache(
code, (), None, None, True, 'nvrtc', False, use_grid_sync)
return module.get_function(name)
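# For reference, a hedged sketch of the substituted source that reaches nvrtc
# when `use_grid_sync` is True (identifiers illustrative):
#
#     #include <cooperative_groups.h>
#
#     /* preamble: per-op __device__ functions */
#
#     extern "C" __global__ void fused_kern(
#             CArray<float, 1, true, true> a0_0, ...) {
#         /* cuda_body emitted by FusedKernel */
#     }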
cdef class FusedKernel:
cdef:
readonly object shape_constraints
readonly str _name
readonly list _params
readonly int _return_size
readonly str _submodule_code
readonly str _cuda_body
readonly dict _cuda_params_memo
readonly list _block_strides
readonly bint _use_grid_sync
readonly list _reduction_in_array
readonly list _reduction_out_array
readonly vector.vector[bint] _is_base
readonly list _dtypes
readonly vector.vector[Py_ssize_t] _input_index
readonly vector.vector[Py_ssize_t] _view_of
readonly vector.vector[Py_ssize_t] _out_params
def __init__(self, name, trace_result):
op_list = trace_result.op_list
params = trace_result.params
return_size = trace_result.return_size
self.shape_constraints = trace_result.shape_constraints
self._name = name
self._params = sorted(params, key=lambda x: x.serial_number)
self._cuda_params_memo = {}
# Generate the device functions.
submodule_code = '\n\n'.join(set(itertools.chain.from_iterable([
op.emit_preamble_codes() for op in op_list]))) + '\n\n'
submodule_code += '\n\n'.join(itertools.chain.from_iterable([
op.emit_submodule_codes() for op in op_list]))
# Generate the function body of a __global__ function.
codes = []
self._use_grid_sync = len(op_list) > 1
if self._use_grid_sync:
codes.append('namespace _cg = cooperative_groups;')
codes.append('_cg::grid_group _grid = _cg::this_grid();')
for i, op in enumerate(op_list):
if i > 0:
codes.append('_cg::sync(_grid);')
codes.append(op.emit_code())
self._submodule_code = submodule_code
self._cuda_body = str(_codeblock.CodeBlock('', codes))
# Check the format of the return value.
if return_size == 'none':
self._return_size = -1
self._out_params.resize(0)
elif return_size == 'single':
self._return_size = -2
self._out_params.resize(1)
else:
assert isinstance(return_size, int)
assert return_size >= 0
self._return_size = return_size
self._out_params.resize(return_size)
for p in self._params:
assert isinstance(p, _TraceVariable)
# Analyse the relationship between variables.
array_dict = {}
self._reduction_in_array = []
self._reduction_out_array = []
self._dtypes = []
for i, p in enumerate(self._params):
view_of = -1
input_index = -1
if p.input_index is not None:
input_index = p.input_index
if isinstance(p, _TraceArray):
if p._view_of is not None:
view_of = array_dict[p._view_of.key()]
if p.is_output:
self._out_params[p.output_index] = i
array_dict[p.key()] = i
self._is_base.push_back(p.is_base)
self._dtypes.append(_dtype.get_dtype(p.dtype))
self._input_index.push_back(input_index)
self._view_of.push_back(view_of)
self._block_strides = []
for op in op_list:
if isinstance(op, _fusion_op._ReductionTraceOp):
self._reduction_in_array.append(
array_dict[op.in_params.item().key()])
self._reduction_out_array.append(
array_dict[op.out_params.item().key()])
self._block_strides.append(
'int {}'.format(op.block_stride_name))
def get_shapes_of_kernel_params(self, tuple args):
"""Returns the shapes of parameters passed to kern.linear_launch.
"""
cdef list kernel_param_shapes = []
cdef int axis
cdef list shape
for param in self._params:
shape = []
if isinstance(param, _TraceArray):
ashape = param.ashape
for axis in range(len(ashape)):
dim = ashape[axis]
if not isinstance(dim, int):
dim = args[dim.input_index].shape[dim.axis]
shape.append(dim)
kernel_param_shapes.append(tuple(shape))
return kernel_param_shapes
cdef list _get_ndarray_list(self, tuple args, list shapes):
"""Get the list of ndarray corresponding to ``self._params``.
"""
cdef list ndarray_list = []
cdef list params = self._params
cdef int i
for i in range(len(params)):
param = params[i]
shape = shapes[i]
if self._input_index[i] >= 0:
array = args[<Py_ssize_t>self._input_index[i]]
elif isinstance(param, _TraceScalar):
array = None
elif self._is_base[i]:
array = _ndarray_init(
_cupy.ndarray, shape, self._dtypes[i], None)
else:
view_of = ndarray_list[<Py_ssize_t>self._view_of[i]]
if param.is_broadcast:
array = _manipulation.broadcast_to(view_of, shape)
elif param.slice_key is not None:
array = view_of[param.slice_key]
elif param.rotate_axis is not None:
axis_permutes = list(param.rotate_axis)
                    for axis in range(param.ndim):
                        if axis not in param.rotate_axis:
                            axis_permutes.append(axis)
axis_permutes = tuple(axis_permutes)
array = _manipulation._transpose(view_of, axis_permutes)
else:
assert False
# For debug
# if isinstance(array, ndarray) and param.rotate_axis is None:
# assert array.shape == shape, (array.shape, shape)
ndarray_list.append(array)
return ndarray_list
cdef object _get_return_value(self, list ndarray_list):
"""Get the return value of ``self.execute``.
"""
cdef int i
if self._return_size == -1:
return None
if self._return_size == -2:
return ndarray_list[<Py_ssize_t>self._out_params[0]]
return tuple([
ndarray_list[<Py_ssize_t>self._out_params[i]]
for i in range(self._return_size)
])
cdef tuple _get_kernel_size(self, list ndarray_list):
"""Calculate the numnber of contiguous blocks in non-reduction axes
of input arrays, and set them to ``self._contiguous_size``.
"""
cdef _ndarray_base in_array, out_array
cdef Py_ssize_t block_size, block_stride, contiguous_size
cdef list block_strides = []
if len(self._reduction_in_array) == 0:
return [], 256, 0
block_size = _default_block_size
for i in range(len(self._reduction_in_array)):
in_array = ndarray_list[self._reduction_in_array[i]]
out_array = ndarray_list[self._reduction_out_array[i]]
# TODO(asi1024): Fix block strides for performance.
contiguous_size = 1
itemsize = in_array.dtype.itemsize
            for axis in range(out_array.ndim):
                if in_array.strides[-axis-1] != contiguous_size * itemsize:
                    break
                contiguous_size *= in_array.shape[-axis-1]
contiguous_size = min(contiguous_size, 32)
reduce_block_size = max(1, in_array.size // max(1, out_array.size))
block_stride = max(
contiguous_size, block_size // reduce_block_size)
block_stride = internal.clp2(block_stride // 2 + 1) # floor
block_strides.append(block_stride)
shared_mem = block_size * 32 # max bytesize of reduce_ctype.
return block_strides, block_size, shared_mem
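    # Hedged worked example for the computation above (CUDA, so
    # `_default_block_size` is 512): reducing a C-contiguous float32 array of
    # shape (1024, 1024) over axis 1 gives contiguous_size = min(1024, 32)
    # = 32 and reduce_block_size = 1048576 // 1024 = 1024, so block_stride =
    # max(32, 512 // 1024) = 32, and clp2(32 // 2 + 1) = 32 (assuming
    # internal.clp2 rounds up to the next power of two) keeps the stride a
    # power of 2.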
cdef tuple _reduce_dims(self, list ndarray_list):
"""Reduce number of dimensions of ndarrays and returns the cache key.
"""
cdef list params = self._params
cdef list ndims = []
cdef _ndarray_base array
cdef int i
for i in range(len(params)):
param = params[i]
if param.ndim <= 1:
continue
array = ndarray_list[i]
array = array.reduced_view()
ndarray_list[i] = array
ndims.append(array.ndim)
return tuple(ndims)
cdef list _get_inout_args(self, tuple args, list ndarray_list):
"""Get the arguments passed to ``kern.linear_launch``.
"""
cdef list params = []
cdef list indexers = []
cdef _carray.Indexer indexer
for i in range(len(self._params)):
array = ndarray_list[i]
if isinstance(array, _ndarray_base):
indexer = _carray.Indexer.__new__(_carray.Indexer)
indexer.init(array._shape)
indexers.append(indexer)
params.append(array)
elif self._input_index[i] >= 0:
obj = args[<Py_ssize_t>self._input_index[i]]
params.append(obj)
return params + indexers
cdef str _get_cuda_params(self, tuple key, list ndarray_list):
"""Get a string of parameters of CUDA main function code.
"""
cdef int i
if key in self._cuda_params_memo:
return self._cuda_params_memo[key]
cuda_params = []
indexers = []
for i in range(len(self._params)):
a = self._params[i]
if isinstance(a, _TraceArray):
array = ndarray_list[i]
ndim = array.ndim
c_contiguous = 'true' if array._c_contiguous else 'false'
index_32_bits = 'true' if array._index_32_bits else 'false'
cuda_params.append(a.format(
'CArray<${type}, ${ndim}, ${cont}, ${ind32}> ${var}',
ndim=ndim, cont=c_contiguous, ind32=index_32_bits))
indexers.append(
a.format('CIndexer<${ndim}> ${indexer}', ndim=ndim))
elif isinstance(a, _TraceScalar):
if a.const_value is None:
cuda_params.append(a.format('${type} ${var}'))
else:
raise TypeError('Unknown type {}.'.format(type(a)))
ret = cuda_params + indexers + self._block_strides
ret = ', '.join(ret)
self._cuda_params_memo[key] = ret
return ret
def execute(self, tuple args, list shapes):
ndarray_list = self._get_ndarray_list(args, shapes)
ret = self._get_return_value(ndarray_list)
reduce_key = self._reduce_dims(ndarray_list)
inout_args = self._get_inout_args(args, ndarray_list)
cuda_params = self._get_cuda_params(reduce_key, ndarray_list)
kern = _cuda_compile(
self._submodule_code, self._name, cuda_params, self._cuda_body,
self._use_grid_sync)
block_strides, block_size, shared_mem = (
self._get_kernel_size(ndarray_list))
# TODO(asi1024): Optimize kernel size parameter.
if not runtime._is_hip_environment:
kern_size = driver.occupancyMaxActiveBlocksPerMultiprocessor(
kern.ptr, block_size, shared_mem) * block_size
else:
            # On HIP the occupancy calculation sometimes seems to be broken
kern_size = block_size * 512
kargs = inout_args + block_strides
kern.linear_launch(
kern_size, kargs, shared_mem, block_size,
enable_cooperative_groups=self._use_grid_sync)
return ret
import string
import numpy
from cupy._core import _codeblock
from cupy._core._fusion_variable import _TraceVariable
from cupy._core._fusion_variable import _TraceArray
from cupy._core._fusion_variable import _VariableSet
from cupy._core import _fusion_thread_local
from cupy._core import _kernel
from cupy._core import _reduction
from cupy._core._scalar import get_typename
class _UfuncRoutine:
"""A device function for single elementwise operations.
"""
def __init__(
self, name, ufunc, routine_code, in_params, out_params,
compute_dtypes):
assert isinstance(name, str)
assert isinstance(ufunc, _kernel.ufunc)
assert isinstance(routine_code, str)
assert isinstance(compute_dtypes, tuple)
assert all(isinstance(t, numpy.dtype) for t in compute_dtypes)
assert isinstance(in_params, list)
assert all(isinstance(p, _TraceVariable) for p in in_params)
assert isinstance(out_params, list)
assert all(isinstance(p, _TraceArray) for p in out_params)
self.name = name
self.in_params = in_params
self.out_params = out_params
self.preamble = ufunc._preamble
self.routine_code = routine_code
self.compute_dtypes = compute_dtypes
def emit_code(self):
"""Returns a CUDA device function code.
Returns a string like:
```
__device__ void cupy_add_0(int &in0_, float &in1_, double &out0_) {
typedef double in0_type;
typedef double in1_type;
typedef double out0_type;
double in0 = (double) in0_;
double in1 = (double) in1_;
double out0 = (double) out0_;
out0 = in0 + in1;
out0_ = out0;
}
```
"""
nin = len(self.in_params)
dtypes = self.compute_dtypes
assert len(self.in_params) == len(self.compute_dtypes[:nin])
in_params = [
(get_typename(p.dtype), get_typename(t), 'in{}'.format(i))
for i, (p, t) in enumerate(zip(self.in_params, dtypes[:nin]))
]
out_params = [
(get_typename(p.dtype), get_typename(t), 'out{}'.format(i))
for i, (p, t) in enumerate(zip(self.out_params, dtypes[nin:]))
]
params = in_params + out_params
params_code = ', '.join(['{} &{}_'.format(t, s) for t, _, s in params])
typedef = ['typedef {} {}_type;'.format(t, s) for _, t, s in params]
read = ['{} {} = ({}) {}_;'.format(t, s, t, s) for _, t, s in params]
write = ['{}_ = {};'.format(s, s) for _, _, s in out_params]
return _codeblock.CodeBlock(
'__device__ void {}({})'.format(self.name, params_code),
typedef + read + [self.routine_code + ';'] + write)
def emit_call_code(self):
params = self.in_params + self.out_params
return '{op_name}({params});'.format(
op_name=self.name,
params=', '.join([var.lvar_name for var in params]))
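# Hedged example of the emitted call (variable names follow the `lvar_name`
# scheme 'v{memory_id}_{serial}'; the values are illustrative):
#
#     cupy_add_0(v0_0, v1_1, v2_2);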
class _ElementwiseTraceOp:
"""Ufunc or elementwise kernel with types.
"""
def __init__(self, ufunc_routines, in_params, out_params, ashape):
        # The `in_params` and `out_params` should already be broadcast to
        # `ashape`, but they are not guaranteed to be exactly the same as
        # `param.ashape`.
_fusion_thread_local.check_not_runtime()
assert isinstance(ufunc_routines, list)
assert all(isinstance(r, _UfuncRoutine) for r in ufunc_routines)
assert isinstance(ashape, tuple)
self.ops = ufunc_routines
self.in_params = _VariableSet(*in_params)
self.out_params = _VariableSet(*out_params)
self.ashape = ashape
@property
def params(self):
"""Returns the set of all variable the loop uses.
"""
res = _VariableSet()
for op in self.ops:
res += _VariableSet(*op.in_params)
res += _VariableSet(*op.out_params)
return res
@staticmethod
def _emit_declaration(params, in_params):
"""Returns a tuple of size 2.
1. CUDA code: declaring local variables.
        2. The set of arrays which require an indexer.
"""
_fusion_thread_local.check_not_runtime()
indexed_arrays = _VariableSet()
code = []
for var in params:
if var in in_params:
if isinstance(var, _TraceArray):
indexed_arrays.add(var)
f = '${type} ${lvar} = ${var}[${indexer}.get()];'
else:
f = '${type} ${lvar} = ${var};'
else:
f = '${type} ${lvar};'
code.append(var.format(f))
return code, indexed_arrays
@staticmethod
def _emit_after_operation(out_params):
"""Returns a tuple of size 2.
1. CUDA code: writing the results of operations back to global memory.
        2. The set of arrays which require an indexer.
"""
_fusion_thread_local.check_not_runtime()
indexed_arrays = _VariableSet()
codes = []
for var in out_params:
if isinstance(var, _TraceArray):
indexed_arrays.add(var)
f = '${var}[${indexer}.get()] = ${lvar};'
else:
f = '${var} = ${lvar};'
codes.append(var.format(f))
return codes, indexed_arrays
@staticmethod
def _emit_set_index(indexed_params, tid):
"""Returns a CUDA code: setting a raw index to indexers.
"""
_fusion_thread_local.check_not_runtime()
assert isinstance(indexed_params, _VariableSet)
return [
p.format('${indexer}.set(${tid});', tid=tid)
for p in indexed_params
]
def emit_code(self):
_fusion_thread_local.check_not_runtime()
declaration, s1 = self._emit_declaration(self.params, self.in_params)
operation = [op.emit_call_code() for op in self.ops]
after_operation, s2 = self._emit_after_operation(self.out_params)
index_name = 'i'
indexed_array = s1 + s2
indexer_name = next(iter(indexed_array)).indexer_name
indexer_setup = self._emit_set_index(indexed_array, index_name)
return _codeblock.CodeBlock(
'CUPY_FOR({}, {}.size())'.format(index_name, indexer_name),
indexer_setup + declaration + operation + after_operation)
def emit_preamble_codes(self):
return [subm.preamble for subm in self.ops if subm.preamble != '']
def emit_submodule_codes(self):
return [str(subm.emit_code()) for subm in self.ops]
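# Hedged sketch of the loop emitted by `_ElementwiseTraceOp.emit_code` for
# one fused op reading `a0_0` and writing `a1_1` (names illustrative):
#
#     CUPY_FOR(i, ind0_0.size()) {
#         ind0_0.set(i);
#         ind1_1.set(i);
#         float v0_0 = a0_0[ind0_0.get()];
#         float v1_1;
#         cupy_op_0(v0_0, v1_1);
#         a1_1[ind1_1.get()] = v1_1;
#     }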
class _ReductionTraceOp:
def __init__(self, name, reduce_func, expr, in_param, out_param, axis):
"""Reduction operation.
"""
_fusion_thread_local.check_not_runtime()
assert isinstance(name, str)
assert isinstance(reduce_func, _reduction._SimpleReductionKernel)
assert isinstance(in_param, _TraceArray)
assert isinstance(out_param, _TraceArray)
assert isinstance(axis, tuple)
assert all(0 <= x < in_param.ndim for x in axis)
self.name = name
self.preamble = reduce_func.preamble
self.in_params = _VariableSet(in_param)
self.out_params = _VariableSet(out_param)
self.block_stride_name = 'block_stride_' + name
self.axis = axis
if reduce_func.identity is None:
self.identity = ''
else:
self.identity = str(reduce_func.identity)
_, self.expr, self.postmap_cast_code, self.reduce_ctype = expr
if self.reduce_ctype is None:
out_param, = self.out_params
self.reduce_ctype = get_typename(out_param.dtype)
self.premap_op = None
self.postmap_op = None
@property
def params(self):
return self.in_params + self.out_params
def emit_code(self):
_fusion_thread_local.check_not_runtime()
assert len(self.in_params) == 1
assert len(self.out_params) == 1
in_param = list(self.in_params)[0]
out_param = list(self.out_params)[0]
params = ', '.join([
in_param.var_name,
out_param.var_name,
in_param.indexer_name,
out_param.indexer_name,
])
return '{}({}, {});'.format(
self.name, params, self.block_stride_name)
def emit_preamble_codes(self):
preamble = self.preamble
return [preamble] if preamble != '' else []
def emit_submodule_codes(self):
"""Returns a CUDA device function code.
The emitted code assumes that ``block_stride`` and `blockDim.x` is a
power of 2.
"""
in_param, = self.in_params
out_param, = self.out_params
op_name = '{}_op'.format(self.name)
postmap_name = '{}_postmap'.format(self.name)
template = string.Template('''
#define ${op_name}(a, b) (${reduce_expr})
#define ${postmap_name}(a, out0) (${postmap_cast})
template <typename InType, typename OutType, typename InIndexerType, typename OutIndexerType>
__device__ void ${name}(
InType in_arr, OutType out_arr,
InIndexerType in_ind, OutIndexerType out_ind, int block_stride) {
typedef ${in_type} type_in0_raw;
typedef ${out_type} type_out0_raw;
typedef ${reduce_ctype} _type_reduce;
extern __shared__ char _sdata_raw[];
_type_reduce *sdata = reinterpret_cast<_type_reduce*>(_sdata_raw);
unsigned int tid = threadIdx.x;
int _J = tid >> __popc(block_stride - 1);
ptrdiff_t _j = (ptrdiff_t)_J * out_ind.size();
int J_stride = blockDim.x >> __popc(block_stride - 1);
ptrdiff_t j_stride = (ptrdiff_t)J_stride * out_ind.size();
for (ptrdiff_t _i = (ptrdiff_t)blockIdx.x * block_stride; _i < out_ind.size(); _i += (ptrdiff_t)gridDim.x * block_stride) {
_type_reduce s = _type_reduce(${identity});
ptrdiff_t i = _i + (tid & (block_stride - 1));
for (ptrdiff_t j = i + _j; j < in_ind.size(); j += j_stride) {
in_ind.set(j);
s = ${op_name}(s, static_cast<_type_reduce>(in_arr[in_ind.get()]));
}
sdata[tid] = s;
__syncthreads();
for (unsigned int block = blockDim.x / 2; block >= block_stride; block >>= 1) {
if (tid < block) {
sdata[tid] = ${op_name}(sdata[tid], sdata[tid + block]);
}
__syncthreads();
}
if (tid < block_stride) {
s = sdata[tid];
}
if (tid < block_stride && i < out_ind.size()) {
out_ind.set(i);
${postmap_name}(s, out_arr[out_ind.get()]);
}
__syncthreads();
}
}''') # NOQA
code = template.substitute(
name=self.name,
op_name=op_name,
postmap_name=postmap_name,
in_type=get_typename(in_param.dtype),
out_type=get_typename(out_param.dtype),
reduce_ctype=self.reduce_ctype,
reduce_expr=self.expr,
identity=self.identity,
postmap_cast=self.postmap_cast_code
)
return [code]
from cupy._core import _fusion_variable
from cupy._core import _fusion_op
def _reduce_memory_access(ops):
required_memories = set()
for op in ops:
for p in op.in_params + op.out_params:
if p.memory.is_inout:
required_memories.add(p.memory)
for op in ops[::-1]:
in_memories = set([p.memory for p in op.in_params])
new_out_params = []
for p in op.out_params:
if p.memory in required_memories:
new_out_params.append(p)
op.out_params = _fusion_variable._VariableSet(*new_out_params)
        # TODO(asi1024): The following improvement is applicable only
        # when the memory space is used at most once.
# `required_memories -= out_memories`
required_memories |= in_memories
return [op for op in ops if len(op.out_params) > 0]
def _normalize_ashapes(ops, variables, shape_constraints):
def normalize(shape):
return tuple([shape_constraints.evaluate(d) for d in shape])
for var in variables:
var.ashape = normalize(var.ashape)
for op in ops:
if isinstance(op, _fusion_op._ElementwiseTraceOp):
op.ashape = normalize(op.ashape)
def _fuse_two_ops(op1, op2):
"""Returns a fused Op if the two ops can be fused, and ``None`` otherwise.
"""
    # TODO(asi1024): Support reduction postmap.
if not isinstance(op1, _fusion_op._ElementwiseTraceOp):
return None
    # TODO(asi1024): Support reduction premap.
if not isinstance(op2, _fusion_op._ElementwiseTraceOp):
return None
if op1.ashape != op2.ashape:
return None
new_in_params = op1.in_params + (op2.in_params - op1.out_params)
new_out_params = op1.out_params + op2.out_params
for in_param in new_in_params:
for out_param in new_out_params:
# Checks if two arrays may share the same memory space.
if in_param.memory == out_param.memory and in_param != out_param:
return None
op1.ops.extend(op2.ops)
op1.in_params = new_in_params
op1.out_params = new_out_params
return op1
def _fuse_consecutive_ops(ops, shape_constraints):
res = []
for op in ops:
if len(res) == 0:
res.append(op)
else:
prev_op = res.pop(-1)
new_op = _fuse_two_ops(prev_op, op)
if new_op is None:
res.extend([prev_op, op])
else:
res.append(new_op)
return res
def optimize(ops, variables, shape_constraints):
_normalize_ashapes(ops, variables, shape_constraints)
ops = _reduce_memory_access(ops)
ops = _fuse_consecutive_ops(ops, shape_constraints)
ops = _reduce_memory_access(ops)
return ops
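# Hedged example of what `optimize` achieves: tracing `(x + 1) * 2` records
# two _ElementwiseTraceOp entries with the same ashape; _fuse_consecutive_ops
# merges them into one op (one kernel loop), and the second
# _reduce_memory_access pass drops the now-unused intermediate output of
# `x + 1` so it never touches global memory. Names illustrative:
#
#     ops = [add_op, mul_op]          # add_op's output feeds mul_op
#     ops = optimize(ops, variables, shape_constraints)
#     assert len(ops) == 1            # a single fused elementwise op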
import threading
thread_local = threading.local()
cpdef inline bint is_old_fusing() except? -1:
try:
return thread_local.is_old_fusing
except AttributeError:
thread_local.is_old_fusing = False
return False
cpdef inline bint is_new_fusing() except? -1:
try:
return thread_local.is_new_fusing
except AttributeError:
thread_local.is_new_fusing = False
return False
cpdef inline bint is_fusing() except? -1:
return is_old_fusing() or is_new_fusing()
def check_not_runtime():
assert is_new_fusing()
def call_ufunc(fusion_op, *args, **kwargs):
if is_new_fusing():
return thread_local.history.call_ufunc(fusion_op, *args, **kwargs)
import cupy
return cupy._core.fusion._call_ufunc(fusion_op, *args, **kwargs)
def call_reduction(fusion_op, *args, **kwargs):
if is_new_fusing():
return thread_local.history.call_reduction(fusion_op, *args, **kwargs)
import cupy
return cupy._core.fusion._call_reduction(fusion_op, *args, **kwargs)
def call_indexing(fusion_op, *args, **kwargs):
return thread_local.history.call_indexing(fusion_op, *args, **kwargs)
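# Hedged sketch of how the dispatch above is driven (assumption: the tracing
# entry point sets these flags around the user function call; the tracing
# module shown later assigns `thread_local.history` in the same way):
#
#     thread_local.is_new_fusing = True
#     thread_local.history = history      # an active trace object
#     try:
#         out = func(*proxy_args)         # cupy.add etc. route to call_ufunc
#     finally:
#         thread_local.is_new_fusing = False
#         thread_local.history = None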
import numpy
from cupy._core import _kernel
from cupy._core import _reduction
from cupy._core import core
from cupy._core._fusion_interface import _VariableProxy
from cupy._core._fusion_interface import _ArrayProxy
from cupy._core import _fusion_thread_local
from cupy._core import _fusion_variable
from cupy._core._fusion_variable import _AbstractDim
from cupy._core._fusion_variable import _TraceScalar
from cupy._core._fusion_variable import _TraceArray
from cupy._core._fusion_variable import _VariableSet
from cupy._core import _fusion_op
from cupy._core import _fusion_optimization
from cupy._core cimport internal
from cupy._core._dtype cimport _raise_if_invalid_cast
_thread_local = _fusion_thread_local.thread_local
_accepted_types = (int, float, bool, complex, numpy.generic)
cdef class _ShapeConstraints:
"""A data structure that manages the conditions between the shapes.
"""
cdef:
        # A list of (_AbstractDim, _AbstractDim) tuples, each representing
        # an equality between two dimensions.
        readonly list eq_constraints
        # A list of (_AbstractDim, int) tuples, used as an associative list
        readonly list const_constraints
def __init__(self):
self.eq_constraints = []
self.const_constraints = []
def add_eq_constraint(self, x, y):
"""Add a constraint: x == y.
"""
_fusion_thread_local.check_not_runtime()
assert isinstance(x, (_AbstractDim, int))
assert isinstance(y, (_AbstractDim, int))
x = self.evaluate(x)
y = self.evaluate(y)
if x == y:
return
if isinstance(x, _AbstractDim) and isinstance(y, _AbstractDim):
self.eq_constraints.append((x, y))
elif isinstance(x, _AbstractDim) and not isinstance(y, _AbstractDim):
self.add_const_constraint(x, y)
elif not isinstance(x, _AbstractDim) and isinstance(y, _AbstractDim):
self.add_const_constraint(y, x)
else:
assert False
def add_const_constraint(self, x, value):
"""Add a constraint: x == value.
"""
_fusion_thread_local.check_not_runtime()
assert isinstance(x, (_AbstractDim, int))
assert isinstance(value, int)
x = self.evaluate(x)
if isinstance(x, _AbstractDim):
self.const_constraints.append((x, value))
else:
assert x == value
def evaluate(self, x):
"""Substitute repeatedly from the equalities.
"""
_fusion_thread_local.check_not_runtime()
assert isinstance(x, (_AbstractDim, int))
for src, dest in self.eq_constraints + self.const_constraints:
if isinstance(x, int):
return x
if x == src:
x = dest
return x
# Used in runtime.
def satisfy(self, dict dim_map):
"""Check if the given dicionary satisfies the constraints.
Args:
dim_map (dict):
A dictionary with keys of _AbstractDim type and
values of int type.
"""
for a, b in self.eq_constraints:
if dim_map[a] != dim_map[b]:
return False
for a, b in self.const_constraints:
if dim_map[a] != b:
return False
return True
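# Hedged usage sketch for _ShapeConstraints (dims illustrative):
#
#     sc = _ShapeConstraints()
#     d0, d1 = _AbstractDim(0, 0), _AbstractDim(1, 0)
#     sc.add_eq_constraint(d0, d1)      # arg0 dim0 must equal arg1 dim0
#     sc.add_const_constraint(d1, 4)    # ... and both must be 4
#     assert sc.evaluate(d0) == 4
#     assert sc.satisfy({d0: 4, d1: 4})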
def _guess_routine(func, args, dtype):
assert isinstance(func, (_kernel.ufunc, _reduction._SimpleReductionKernel))
    # Feed dummy arguments with appropriate dtypes to `guess_routine`.
dummy_args = []
for x in args:
if isinstance(x, _TraceScalar):
obj = x.dtype.type(0)
else:
assert isinstance(x, _TraceArray)
obj = core.ndarray((0,), x.dtype)
dummy_args.append(obj)
op = func._ops.guess_routine(
func.name, func._routine_cache, dummy_args, dtype, None)
return op.get_in_dtypes(), op.get_out_dtypes(), op.routine
def _base(array):
"""Returns the base array object of given array.
"""
assert isinstance(array, core.ndarray)
return array if array.base is None else array.base
class _VariableCoordinator:
"""Variable constuct manager.
This class calls ``_TraceArray`` or ``_TraceScalar`` internally
with unique serial numbers and returns the variable object. In
``TraceImpl`` class, a method of ``history.vc``, which is of
``_VariableConduductor`` class, should be called instead of
```_TraceArray.__init__`` or ``_TraceScalar.__init__``.
"""
def __init__(self):
self._memory_number = 0
self._serial_number = 0
self._variables_dict = {}
def _normalize_variable(self, var):
"""If the input variable is already generated previously, returns it.
"""
key = var.key()
if key not in self._variables_dict:
self._variables_dict[key] = var
return self._variables_dict[key]
def _generate_new_variable(self, var_module, dtype, **kwargs):
serial_number = self._serial_number
memory = _fusion_variable._MemorySpace(
self._memory_number, serial_number)
self._serial_number += 1
self._memory_number += 1
ret = var_module(memory, serial_number, dtype, **kwargs)
memory.is_input = ret.is_input
return self._normalize_variable(ret)
def generate_new_array(self, dtype, rshape, ashape, input_index=None):
"""Generate new _TraceArray object with a new memory space.
"""
ret = self._generate_new_variable(
_TraceArray,
dtype, rshape=rshape, ashape=ashape, input_index=input_index)
ret.memory.base_ashape = ret.ashape
return ret
def generate_new_scalar(self, dtype, **kwargs):
"""Generate new _TraceScalar object with a new memory space.
"""
return self._generate_new_variable(_TraceScalar, dtype, **kwargs)
def make_view(self, var, **kwargs):
assert isinstance(var, _TraceArray)
serial_number = self._serial_number
self._serial_number += 1
ret = var.make_view(serial_number, **kwargs)
return self._normalize_variable(ret)
def broadcast_to(self, var, ashape, rshape):
"""Make a view of the input array with the given shape.
"""
return self.make_view(
var, ashape=ashape, rshape=rshape, broadcasted_from=var)
def rotate_with_axis(self, var, axis):
"""Make a view of an array by rotating ``var`` with given axis.
"""
assert isinstance(var, _TraceArray)
return self.make_view(var, rotated_from=var, axis=axis)
def indexing(self, var, indices):
"""Make a view of an array. by indexing ``var`` with given tuple.
"""
skip = var.ndim - sum([isinstance(x, (int, slice)) for x in indices])
it = 0
ashape = []
rshape = []
if skip < 0:
raise IndexError('Too many indices for array.')
for index in indices:
if isinstance(index, int):
it += 1
elif isinstance(index, slice):
if not (index.start is None
and index.stop is None
and index.step in (1, -1, None)):
raise NotImplementedError(
'Only full range ``x[::]`` or reverse ``x[::-1]`` is '
'supported for basic slicing in CuPy fusion.')
ashape.append(var.ashape[it])
rshape.append(var.rshape[it])
it += 1
elif index is None:
ashape.append(1)
rshape.append(1)
elif index is Ellipsis:
ashape.extend(var.ashape[it:it + skip])
rshape.extend(var.rshape[it:it + skip])
it += skip
ashape.extend(var.ashape[it:var.ndim])
rshape.extend(var.rshape[it:var.ndim])
return self.make_view(
var, indexed_from=var, index_key=indices,
ashape=tuple(ashape), rshape=tuple(rshape))
@property
def all_variables(self):
"""Returns the list of all variables this class emitted.
"""
return list(self._variables_dict.values())
class TraceImpl:
"""Emit a fused kernel from the given target function.
"""
def __init__(self):
self.vc = _VariableCoordinator()
self.shape_constraints = _ShapeConstraints()
self.op_list = []
@staticmethod
def _make_interface(x):
"""Returns an _array or a _scalar object which packs the given value.
"""
if x is None:
return None
assert isinstance(x, _fusion_variable._TraceVariable)
return x.as_interface()
def _unwrap_interface(self, x, *, allow_none=False):
"""Returns ``_TraceVariable`` object from the input.
"""
if allow_none and x is None:
return None
if isinstance(x, _VariableProxy):
return x.content
if isinstance(x, _accepted_types):
dtype = numpy.dtype(type(x))
return self.vc.generate_new_scalar(dtype, const_value=x)
if isinstance(x, (numpy.ndarray, core.ndarray)):
raise TypeError('Concrete ndarray is not supported in fusion.')
raise TypeError('{} type is not supported'.format(type(x)))
def call_ufunc(self, ufunc, *args, **kwargs):
"""Register an elementwise operation with the given parameters.
Args:
ufunc(_kernel.ufunc): The ufunc to operate.
args(tuple): The arguments.
kwargs(dict): The keyword arguments.
"""
assert isinstance(ufunc, _kernel.ufunc)
# Parse Inputs.
nin = ufunc.nin
nout = ufunc.nout
dtype = kwargs.pop('dtype', None)
if 'out' in kwargs and len(args) > nin:
raise ValueError(
'cannot specify \'out\' as both a positional and '
'keyword argument')
in_params = [self._unwrap_interface(x) for x in args[:nin]]
out_params = [
self._unwrap_interface(x, allow_none=True)
for x in args[nin:] + (kwargs.pop('out', None),)
if x is not None
]
params = in_params + out_params
if len(kwargs) > 0:
raise TypeError('Wrong arguments {}'.format(kwargs))
if len(in_params) != nin or len(out_params) > nout:
raise ValueError('Invalid number of arguments')
if not all([isinstance(v, _TraceArray) for v in out_params]):
raise TypeError('Return arrays must be of ArrayType')
# Check for inplace operation.
for i, out_param1 in enumerate(out_params):
for out_param2 in out_params[:i]:
if out_param1.memory == out_param2.memory:
# NumPy does not raise this error.
raise ValueError('Outputs of ufunc must not share memory')
# Copy the input array data before the operation when the input array
# shares the same memory area with an output array.
for i, in_param in enumerate(in_params):
should_copy = any([
in_param.memory == out_param.memory and in_param != out_param
for out_param in out_params
])
if should_copy:
in_params[i] = self._unwrap_interface(
self.call_ufunc(
core.elementwise_copy,
self._make_interface(in_param)))
# Broadcast shapes
out_rshape = internal._broadcast_shapes([p.rshape for p in params])
out_ashape = [None for _ in range(len(out_rshape))]
for p in params:
for axis in range(-p.ndim, 0):
if p.rshape[axis] == out_rshape[axis]:
out_ashape[axis] = p.ashape[axis]
assert all([dim is not None for dim in out_ashape])
out_ashape = tuple(out_ashape)
# Broadcast input params and make their views.
for i, p in enumerate(in_params):
for axis in range(-p.ndim, 0):
if p.rshape[axis] == out_rshape[axis]:
self.shape_constraints.add_eq_constraint(
p.ashape[axis], out_ashape[axis])
elif p.rshape[axis] == 1:
self.shape_constraints.add_const_constraint(
p.ashape[axis], 1)
else:
assert False
if isinstance(p, _TraceArray) and p.rshape != out_rshape:
                # Broadcast the input if needed.
in_params[i] = self.vc.broadcast_to(p, out_ashape, out_rshape)
# Get operation code from dtypes.
in_dtypes, out_dtypes, expr = _guess_routine(
ufunc, in_params, dtype)
# Make output arrays.
ret = []
for i in range(nout):
if i >= len(out_params):
# Omitted output.
out_pvar = self.vc.generate_new_array(
out_dtypes[i], out_rshape, out_ashape)
out_params.append(out_pvar)
            elif isinstance(out_params[i], _TraceScalar):
raise TypeError('return arrays must be of ArrayType')
elif out_params[i].rshape != out_rshape:
raise ValueError(
'non-broadcastable output operand with shape {} '
'doesn\'t match the broadcast shape {}'.format(
out_params[i].rshape, out_rshape))
_raise_if_invalid_cast(
out_dtypes[i], out_params[i].dtype, 'same_kind',
'output operand')
out_pvar = out_params[i]
ret.append(out_pvar)
# Register Op.
name = ufunc.name + '_' + str(len(self.op_list))
ufunc_routine = _fusion_op._UfuncRoutine(
name, ufunc, expr, in_params, out_params, in_dtypes + out_dtypes)
op = _fusion_op._ElementwiseTraceOp(
[ufunc_routine], in_params, out_params, out_ashape)
self.op_list.append(op)
# Returns.
assert len(ret) > 0
if len(ret) == 1:
return self._make_interface(ret[0])
else:
return tuple([self._make_interface(x) for x in ret])
def call_reduction(
self, reduce_func, a, axis=None, dtype=None, out=None,
keepdims=False):
"""Register a reduction operation with the given parameters.
Args:
reduce_func(_reduction._SimpleReductionKernel):
The reduction function to operate.
a(array_like): The input array.
axis(int, tuple of int or None): The axis.
dtype(numpy.dtype or None): The dtype
out(_array or None): The output array.
"""
assert isinstance(reduce_func, _reduction._SimpleReductionKernel)
# Parse inputs.
in_param = self._unwrap_interface(a)
if not isinstance(in_param, _TraceArray):
raise NotImplementedError(
'Reduction for scalar arguments is not supported.')
axes = internal._normalize_axis_indices(axis, in_param.ndim)
if dtype is not None:
dtype = numpy.dtype(dtype)
if keepdims:
raise NotImplementedError('keepdims is not supported.')
# Determine the shape of out_param.
out_ashape = tuple([
d for axis, d in enumerate(in_param.ashape) if axis not in axes])
out_rshape = tuple([
d for axis, d in enumerate(in_param.rshape) if axis not in axes])
        # Rotate axes.
        # This transpose is performed only as a performance improvement.
if not all([i == axis for i, axis in enumerate(axes)]):
in_param = self.vc.rotate_with_axis(in_param, axes)
# Get operation code from dtypes.
_, (out_dtype,), expr = _guess_routine(reduce_func, [in_param], dtype)
# Make an output array.
if out is None:
# Omitted output.
out_param = self.vc.generate_new_array(
out_dtype, out_rshape, out_ashape)
else:
out_param = self._unwrap_interface(out)
if out_param.rshape != out_rshape:
raise ValueError(
'Shape of specified output variable is not consistent '
'with reduced shape.')
# Register Op.
name = 'reduce{}'.format(len(self.op_list))
op = _fusion_op._ReductionTraceOp(
name, reduce_func, expr, in_param, out_param, axes)
self.op_list.append(op)
# Returns.
return self._make_interface(out_param)
def call_indexing(self, in_param, indices):
"""Call indexing routines.
"""
in_param = self._unwrap_interface(in_param)
if not isinstance(indices, tuple):
indices = (indices,)
        for x in indices:
            if isinstance(x, (list, _TraceArray)):
                # Advanced indexing
                raise NotImplementedError(
                    'Advanced indexing is not supported currently.')
if not (isinstance(x, (int, slice)) or x is None or x is Ellipsis):
raise IndexError(
'Indices must be integers, slices, ellipsis, None or '
'integer or boolean arrays.')
# Basic indexing
out_param = self.vc.indexing(in_param, indices)
return self._make_interface(out_param)
def trace(self, func, args):
"""Call ``self.func`` with _TraceVariable arguments.
Returns:
out_params(list of _TraceVariable): The list of outputs.
return_size(int or str): If ``return_size`` is of int type,
it indicates the size of tuple of outputs.
If `none`, the output is ``None`` and ``out_params`` is empty.
If `single`, the output is single array and ``out_params``
is a singleton list.
During the function call, ``call_ufunc``, ``call_reduction`` and
``call_indexing`` are called internally.
"""
# Register input variables.
in_params = []
array_dict = {}
memory_dict = {}
for input_index, arg in enumerate(args):
if arg is None:
var = None
elif isinstance(arg, core.ndarray):
arg_id = id(arg)
base_id = id(_base(arg))
if arg_id in array_dict:
# The array is already given as an input.
var = in_params[array_dict[arg_id]]
assert isinstance(var, _TraceArray)
elif base_id in memory_dict:
                    # This is an array which shares the same memory.
base = in_params[memory_dict[base_id]]
assert isinstance(base, _TraceArray)
var = self.vc.make_view(base, input_index=input_index)
else:
# Otherwise.
var = self.vc.generate_new_array(
arg.dtype, arg.shape, None, input_index=input_index)
array_dict[arg_id] = input_index
memory_dict[base_id] = input_index
else:
# Scalar input.
dtype = numpy.dtype(type(arg))
var = self.vc.generate_new_scalar(
dtype, input_index=input_index)
in_params.append(var)
# Call the target function.
inputs = [self._make_interface(x) for x in in_params]
output = func(*inputs)
# Register output variables.
if output is None:
return_size = 'none'
out_params = []
elif isinstance(output, _ArrayProxy):
return_size = 'single'
out_params = [self._unwrap_interface(output, allow_none=True)]
elif isinstance(output, tuple):
if all(isinstance(x, _ArrayProxy) for x in output):
return_size = len(output)
out_params = [
self._unwrap_interface(x, allow_none=True) for x in output]
else:
                raise ValueError(
                    'All elements of the return value of a fused function '
                    'must be of _ArrayProxy type.'
                )
else:
            raise ValueError(
                'The return value of a fused function must be `None`, '
                'an ndarray or a tuple of ndarrays.'
            )
for output_index, out_param in enumerate(out_params):
assert isinstance(out_param, _TraceArray)
out_param.output_index = output_index
out_param.memory.is_output = True
return out_params, return_size
def _get_ancestors_of_trace_variable(var):
if var is None:
return _VariableSet()
res = _VariableSet(var)
if isinstance(var, _TraceArray):
res += _get_ancestors_of_trace_variable(var._view_of)
return res
class _TraceResult:
def __init__(self, op_list, params, return_size, shape_constraints):
self.op_list = op_list
self.params = params
self.return_size = return_size
self.shape_constraints = shape_constraints
def trace(func, args):
history = TraceImpl()
try:
_thread_local.history = history
# Call `func(args)` and update `op_list`.
out_params, return_size = history.trace(func, args)
finally:
_thread_local.history = None
op_list = history.op_list
shape_constraints = history.shape_constraints
all_variables = history.vc.all_variables
op_list = _fusion_optimization.optimize(
op_list, all_variables, shape_constraints)
# Make info passed to FusedKernel.
kernel_params = _VariableSet()
for p in out_params:
kernel_params += _get_ancestors_of_trace_variable(p)
for op in op_list:
for p in op.in_params + op.out_params:
kernel_params += _get_ancestors_of_trace_variable(p)
kernel_params = list(kernel_params)
# used in mock tests.
history.kernel_params = kernel_params
history.op_list = op_list
return _TraceResult(op_list, kernel_params, return_size, shape_constraints)
cdef class _AbstractDim:
cdef:
readonly int input_index
readonly int axis
import string
import numpy
from cupy._core import _fusion_interface
from cupy._core._scalar cimport get_typename
cdef class _AbstractDim:
"""An abstracted data structure for a length of dimensions.
Attributes:
input_index (int):
The position of the element in the arguments passed to the
fused function
axis (int):
The index of dimensions
"""
def __init__(self, int input_index, int axis):
self.input_index = input_index
self.axis = axis
def __hash__(self):
return hash((self.input_index, self.axis))
def __eq__(self, object other):
if isinstance(other, int):
return False
return (
self.input_index == other.input_index
and self.axis == other.axis
)
class _MemorySpace:
"""A memory space object.
Attributes:
id(int): The serial number of memory space.
        base_serial_number(int): The serial number of the base variable
            which owns this memory space.
is_input(bool): If this is set to ``True``, the memory space is
already allocated as an input array. If this is set to ``False``,
the memory space should be allocated before launching the kernel.
is_output(bool): If this is set to ``True``, the memory space is
used in the return values.
"""
def __init__(self, memory_id, base_serial_number):
assert isinstance(memory_id, int)
assert isinstance(base_serial_number, int)
self.id = memory_id
self.base_serial_number = base_serial_number
# Initially, these attributes are set to be `False`, but might be
# updated from outside.
self.is_input = False
self.is_output = False
@property
def is_inout(self):
"""Returns ``True`` if the memory space is used for inputs or outputs.
If ``True``, the memory space should not be deallocated just after
the kernel launch. If ``False``, the memory space is used only for
temporary value in the fused kernel."""
return self.is_input or self.is_output
class _TraceVariable:
"""Variable object to trace operations in the target function to be fused.
Attributes:
        memory(_MemorySpace): The memory space the variable uses.
serial_number(int): The serial number of the variable object.
dtype(dtype): The dtype of the variable.
rshape(tuple of int): The real shape of the variable.
ashape(tuple of _AbstractDim): An abstracted shape of the variable.
input_index(int or None): If not `None`, this variable is used as
the `input_index`-th input parameter.
output_index(int or None): If not `None`, this variable is used as
the `output_index`-th output parameter.
"""
def __init__(
self, memory_space, serial_number, dtype, rshape, ashape,
input_index, output_index):
assert isinstance(memory_space, _MemorySpace)
assert isinstance(serial_number, int)
assert isinstance(dtype, numpy.dtype)
assert input_index is None or isinstance(input_index, int)
assert output_index is None or isinstance(output_index, int)
assert isinstance(rshape, tuple)
assert isinstance(ashape, tuple)
assert len(rshape) == len(ashape)
for rdim, adim in zip(rshape, ashape):
assert isinstance(rdim, int)
assert isinstance(adim, (int, _AbstractDim))
self.memory = memory_space
self.serial_number = serial_number
self.dtype = dtype
self.rshape = rshape
self.ashape = ashape
self.input_index = input_index
self.output_index = output_index
@property
def ndim(self):
return len(self.ashape)
@property
def is_base(self):
return self.serial_number == self.memory.base_serial_number
@property
def is_input(self):
return self.input_index is not None
@property
def is_output(self):
return self.output_index is not None
@property
def var_name(self):
        # The name of the variable stored in the global memory space.
raise NotImplementedError
@property
def lvar_name(self):
        # The name of the variable stored in registers in each thread.
raise NotImplementedError
@property
def indexer_name(self):
"""The name of CUDA CIndxer variable for the variable.
"""
# TODO(asi1024): Unify indexer with other variables which have the
# same shape, for performance improvements.
return 'ind{}_{}'.format(self.memory.id, self.serial_number)
def format(self, form, **kwargs):
"""Returns a string following the format taken as an input.
"""
kwargs = dict([
(k, get_typename(v) if isinstance(v, numpy.dtype) else v)
for k, v in kwargs.items()]
)
return string.Template(form).substitute(
type=get_typename(self.dtype),
var=self.var_name,
lvar=self.lvar_name,
indexer=self.indexer_name,
**kwargs
)
def __hash__(self):
assert False, (
'__hash__ is not defined. Use _VariableSet instead of '
'set/dict because they do not guarantee the order of contents.')
class _TraceScalar(_TraceVariable):
"""An abstracted scalar object.
Attributes:
const_value(scalar object or None): A compile-time constant value.
Actually, it is `None` iff self.is_input is `True`.
"""
# TODO(asi1024): Remove index argument.
def __init__(
self, index, serial_number, dtype, input_index=None, *,
const_value=None,):
super().__init__(
index, serial_number, dtype, (), (), input_index, None)
self.const_value = const_value
@property
def var_name(self):
if self.const_value is None:
return 'a{}'.format(self.memory.id)
if self.dtype == '?':
return str(self.const_value).lower()
if self.dtype.kind == 'c':
return '{}({}, {})'.format(
get_typename(self.dtype),
self.const_value.real,
self.const_value.imag)
return str(self.const_value)
@property
def lvar_name(self):
return 'v{}'.format(self.memory.id)
def as_interface(self):
return _fusion_interface._ScalarProxy(self)
def key(self):
return (self.memory.id,)
class _TraceArray(_TraceVariable):
"""An abstracted array object.
Attributes:
broadcasted_from(_TraceArray optional): TODO
rotated_from(_TraceArray optional): TODO
axis(int optional): The axis to rotate.
indexed_from(_TraceArray optional): TODO
index_key(slice): TODO
"""
def __init__(
self, index, serial_number, dtype, input_index=None,
output_index=None, *, rshape, ashape, **kwargs):
if ashape is None:
assert input_index is not None
ndim = len(rshape)
ashape = tuple([
_AbstractDim(input_index, axis) for axis in range(ndim)])
super().__init__(
index, serial_number, dtype, rshape, ashape,
input_index, output_index)
self._view_of = None
self.is_broadcast = False
self.rotate_axis = None
self.slice_key = None
if 'broadcasted_from' in kwargs:
self._view_of = kwargs.pop('broadcasted_from')
self.is_broadcast = True
elif 'rotated_from' in kwargs:
self._view_of = kwargs.pop('rotated_from')
self.rotate_axis = kwargs.pop('axis')
elif 'indexed_from' in kwargs:
self._view_of = kwargs.pop('indexed_from')
self.slice_key = kwargs.pop('index_key')
assert len(kwargs) == 0, kwargs
@property
def var_name(self):
return 'a{}_{}'.format(self.memory.id, self.serial_number)
@property
def lvar_name(self):
return 'v{}_{}'.format(self.memory.id, self.serial_number)
def as_interface(self):
return _fusion_interface._ArrayProxy(self)
def make_view(self, serial_number, **kwargs):
rshape = kwargs.pop('rshape', self.rshape)
ashape = kwargs.pop('ashape', self.ashape)
return _TraceArray(
self.memory, serial_number, self.dtype,
rshape=rshape, ashape=ashape, **kwargs)
def key(self):
"""Two variables can be identified if they have the same key.
"""
if isinstance(self.slice_key, tuple):
slice_key = []
for s in self.slice_key:
if isinstance(s, slice):
if not (s.start is None
and s.stop is None
and s.step in (None, 1, -1)):
raise NotImplementedError(
                            'Basic slicing supports only x[::] and x[::-1].')
slice_key.append((s.start, s.stop, s.step))
else:
slice_key.append(s)
slice_key = tuple(slice_key)
else:
slice_key = self.slice_key
return (
self.memory.id, self.ashape, self.input_index,
getattr(self._view_of, 'serial_number', None),
self.is_broadcast, self.rotate_axis, slice_key,
)
class _VariableSet:
"""A stable set of variables
"""
def __init__(self, *args):
self.contents = []
for x in args:
assert isinstance(x, _TraceVariable)
if x not in self.contents:
self.contents.append(x)
def __len__(self):
return len(self.contents)
def item(self):
assert len(self.contents) == 1
return self.contents[0]
def add(self, x):
if x not in self.contents:
self.contents.append(x)
def __iadd__(self, other):
assert isinstance(other, _VariableSet)
for x in other.contents:
self.add(x)
return self
def __add__(self, other):
res = _VariableSet(*self.contents)
res += other
return res
def __contains__(self, elem):
return elem in self.contents
def __iter__(self):
return iter(self.contents)
def __isub__(self, other):
assert isinstance(other, _VariableSet)
for x in other.contents:
if x in self.contents:
self.contents.remove(x)
return self
def __sub__(self, other):
res = _VariableSet(*self.contents)
res -= other
return res
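# Hedged sketch of _VariableSet semantics (a, b, c: _TraceVariable objects):
# insertion order is preserved, which keeps generated parameter order
# deterministic, unlike a plain set of these unhashable variables:
#
#     s = _VariableSet(a, b) + _VariableSet(b, c)
#     assert list(s) == [a, b, c]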
import re
import numpy
import cupy
import cupy._core._routines_manipulation as _manipulation
from cupy._core._dtype import get_dtype, _raise_if_invalid_cast
from cupy._core import internal
# Signature parsing code and dimension accessing has been borrowed
# from dask
# https://github.com/dask/dask/blob/61b578f5a3ad88cbc6a8b9a73ce08c551bd969fa/dask/array/gufunc.py#L12-L55
_DIMENSION_NAME = r'\w+\?*'
_CORE_DIMENSION_LIST = '(?:{0:}(?:,{0:})*,?)?'.format(_DIMENSION_NAME)
_ARGUMENT = r'\({}\)'.format(_CORE_DIMENSION_LIST)
_INPUT_ARGUMENTS = '(?:{0:}(?:,{0:})*,?)?'.format(_ARGUMENT)
_OUTPUT_ARGUMENTS = '{0:}(?:,{0:})*'.format(
_ARGUMENT
) # Use `'{0:}(?:,{0:})*,?'` if gufunc signatures
# should be allowed for length-1 tuple returns
_SIGNATURE = '^{0:}->{1:}$'.format(_INPUT_ARGUMENTS, _OUTPUT_ARGUMENTS)
def _parse_gufunc_signature(signature):
    # The code has been modified from dask to support optional dimensions
if not isinstance(signature, str):
raise TypeError('Signature is not a string')
if signature == '' or signature is None:
raise ValueError('Signature cannot be empty')
signature = signature.replace(' ', '')
if not re.match(_SIGNATURE, signature):
raise ValueError('Not a valid gufunc signature: {}'.format(signature))
in_txt, out_txt = signature.split('->')
ins = [tuple(x.split(',')) if x != '' else ()
for x in in_txt[1:-1].split('),(')]
outs = [tuple(y.split(',')) if y != '' else ()
for y in out_txt[1:-1].split('),(')]
# TODO(ecastill) multiple output support
if len(outs) > 1:
raise ValueError('Currently more than 1 output is not supported')
return ins, outs
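# Hedged usage sketch:
#
#     >>> _parse_gufunc_signature('(i,j),(j,k)->(i,k)')
#     ([('i', 'j'), ('j', 'k')], [('i', 'k')])
#     >>> _parse_gufunc_signature('(),()->()')
#     ([(), ()], [()])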
def _validate_normalize_axes(
axes, axis, keepdims, input_coredimss, output_coredimss
):
# This code credit goes to Dask
# https://github.com/dask/dask/blob/61b578f5a3ad88cbc6a8b9a73ce08c551bd969fa/dask/array/gufunc.py#L58-L172
nin = len(input_coredimss)
nout = (
1 if not isinstance(output_coredimss, list) else len(output_coredimss)
)
if axes is not None and axis is not None:
raise ValueError(
'Only one of `axis` or `axes` keyword arguments should be given')
if axes and not isinstance(axes, list):
raise ValueError('`axes` has to be of type list')
# output_coredimss = output_coredimss if nout > 1 else [output_coredimss]
filtered_core_dims = list(filter(len, input_coredimss))
nr_outputs_with_coredims = len(
[True for x in output_coredimss if len(x) > 0])
if keepdims:
if nr_outputs_with_coredims > 0:
raise ValueError('`keepdims` can only be used for scalar outputs')
output_coredimss = len(output_coredimss) * [filtered_core_dims[0]]
core_dims = input_coredimss + output_coredimss
if axis is not None:
if not isinstance(axis, int):
raise ValueError('`axis` argument has to be an integer value')
if filtered_core_dims:
cd0 = filtered_core_dims[0]
if len(cd0) != 1:
raise ValueError(
'`axis` can only be used if exactly one core'
' dimension is present'
)
for cd in filtered_core_dims:
if cd0 != cd:
raise ValueError(
'To use `axis`, all core dimensions have to be equal'
)
# Expand defaults or axis
if axes is None:
if axis is not None:
axes = [(axis,) if cd else tuple() for cd in core_dims]
else:
axes = [tuple(range(-len(icd), 0)) for icd in core_dims]
axes = [(a,) if isinstance(a, int) else a for a in axes]
if (
(nr_outputs_with_coredims == 0)
and (nin != len(axes))
and (nin + nout != len(axes))
) or ((nr_outputs_with_coredims > 0) and (nin + nout != len(axes))):
raise ValueError(
'The number of `axes` entries is not equal to the number'
' of input and output arguments')
# Treat outputs
output_axes = axes[nin:]
output_axes = (
output_axes
if output_axes
else [tuple(range(-len(ocd), 0)) for ocd in output_coredimss]
)
input_axes = axes[:nin]
# Assert we have as many `axes` entries as input core dimensions
for idx, (iax, icd) in enumerate(zip(input_axes, input_coredimss)):
if len(iax) != len(icd):
raise ValueError(
f'The number of `axes` entries for argument #{idx}'
' is not equal to the number of respective input core'
' dimensions in signature')
if not keepdims:
for idx, (oax, ocd) in enumerate(zip(output_axes, output_coredimss)):
if len(oax) != len(ocd):
raise ValueError(
f'The number of `axes` entries for argument #{idx}'
' is not equal to the number of respective output core'
' dimensions in signature')
else:
if input_coredimss:
icd0 = input_coredimss[0]
for icd in input_coredimss:
if icd0 != icd:
raise ValueError(
'To use `keepdims`, all core dimensions'
' have to be equal')
iax0 = input_axes[0]
output_axes = [iax0 for _ in output_coredimss]
return input_axes, output_axes
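# Illustrative example (not part of the original module): for the
# matmul-like signature '(i,j),(j,k)->(i,k)' with no explicit `axes`
# or `axis`, the defaults map every core dimension to the trailing
# axes of each operand:
#
#     >>> _validate_normalize_axes(
#     ...     None, None, False,
#     ...     [('i', 'j'), ('j', 'k')], [('i', 'k')])
#     ([(-2, -1), (-2, -1)], [(-2, -1)])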
class _OpsRegister:
'''
Holds the ops for each dtype signature, e.g. ``('ff->f', func1)``,
and allows look-ups on them
'''
class _Op:
def __init__(self, in_types, out_types, func):
self.func = func
self.in_types = tuple(numpy.dtype(i) for i in in_types)
self.out_types = tuple(numpy.dtype(o) for o in out_types)
self.sig_str = (''.join(
in_t.char for in_t in self.in_types) + '->' + ''.join(
out_t.char for out_t in self.out_types))
def __init__(self, signatures, default_func, nin, nout, name):
self._default_func = default_func
self._nin = nin
self._nout = nout
self._ops = self._process_signatures(signatures)
self._name = name
def _sig_str_to_tuple(self, sig):
sig = sig.replace(' ', '')
toks = sig.split('->')
if len(toks) != 2:
raise ValueError(f'signature {sig} for dtypes is invalid')
else:
ins, outs = toks
return ins, outs
def _process_signatures(self, signatures):
ops = []
for sig in signatures:
if isinstance(sig, tuple):
sig, op = sig
else:
op = self._default_func
ins, outs = self._sig_str_to_tuple(sig)
# Check the number of inputs and outputs matches the gufunc sig
if len(ins) != self._nin:
raise ValueError(
f'signature {sig} for dtypes is invalid: the number of'
' inputs is not consistent with the general signature')
if len(outs) != self._nout:
raise ValueError(
f'signature {sig} for dtypes is invalid: the number of'
' outputs is not consistent with the general signature')
ops.append(_OpsRegister._Op(ins, outs, op))
return ops
def _determine_from_args(self, args, casting):
n = len(args)
in_types = tuple(arg.dtype for arg in args)
for op in self._ops:
op_types = op.in_types
for i in range(n):
it = in_types[i]
ot = op_types[i]
if not numpy.can_cast(it, ot, casting=casting):
break
else:
return op
return None
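# Illustrative behavior (not part of the original module), assuming a
# register built as _OpsRegister(['ff->f', 'dd->d'], my_func, 2, 1, 'op'):
# with casting='safe', two float32 arguments select the 'ff->f' op,
# the first entry all inputs can be safely cast to; two float64
# arguments cannot be safely cast to float32, so they fall through to
# 'dd->d'.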
def _determine_from_dtype(self, dtype):
for op in self._ops:
op_types = op.out_types
for t in op_types:
if t != dtype:
break
else:
return op
return None
def _determine_from_signature(self, signature):
# Let's convert the signature, as it can be a tuple of dtypes
# or a string
if isinstance(signature, tuple):
# create a string to do a look-up on the ops
if len(signature) == 1:
raise TypeError(
'The use of a length 1 tuple for the ufunc `signature` is'
' not allowed. Use `dtype` or fill the tuple with'
' `None`s.')
nin = self._nin
nout = self._nout
if len(signature) != (nin + nout):
raise TypeError(
'A type-tuple must be specified of length'
f' {nin + nout} for ufunc {self._name}')
signature = ''.join(
numpy.dtype(t).char for t in signature[:nin]) + '->' + ''.join(
numpy.dtype(t).char for t in signature[nin:nin+nout])
if isinstance(signature, str):
is_out = len(signature) == 1
for op in self._ops:
if is_out:
for t in op.out_types:
if t.char != signature:
break
else:
return op
else:
if op.sig_str == signature:
return op
raise TypeError('No loop matching the specified signature and'
f' casting was found for ufunc {self._name}')
def determine_dtype(self, args, dtype, casting, signature):
ret_dtype = None
func = self._default_func
if signature is not None:
# TODO(ecastill) use an externally provided signature to
# find the typecasting rules
op = self._determine_from_signature(signature)
elif dtype is not None:
if isinstance(dtype, tuple):
# TODO(ecastill) support dtype tuples
raise RuntimeError('dtype with tuple is not yet supported')
op = self._determine_from_dtype(dtype)
else:
op = self._determine_from_args(args, casting)
if op is None:
# Should we allow op to be none?
if dtype is None:
# Promote the result dtype across all of the input dtypes
ret_dtype = args[0].dtype
for arg in args:
ret_dtype = numpy.promote_types(ret_dtype, arg.dtype)
else:
ret_dtype = get_dtype(dtype)
else:
# Convert args to the op specified in_types
n_args = []
def argname(): return f'ufunc {self._name} input {i}'
for i, (arg, in_type) in enumerate(zip(args, op.in_types)):
_raise_if_invalid_cast(arg.dtype, in_type, casting, argname)
n_args.append(arg.astype(in_type, copy=False))
args = n_args
ret_dtype = op.out_types[0]
func = op.func
return args, ret_dtype, func
class _GUFunc:
'''
Creates a Generalized Universal Function by wrapping a user
provided function with the signature.
``signature`` determines if the function consumes or produces core
dimensions. The remaining dimensions in given input arrays (``*args``)
are considered loop dimensions and are required to broadcast
naturally against each other.
Args:
func (callable):
Function to call like ``func(*args, **kwargs)`` on input arrays
(``*args``) that returns an array or tuple of arrays. If
multiple arguments with non-matching dimensions are supplied,
this function is expected to vectorize (broadcast) over axes of
positional arguments in the style of NumPy universal functions.
signature (string):
Specifies what core dimensions are consumed and produced by
``func``, following the NumPy generalized universal function
signature specification.
supports_batched (bool, optional):
If ``True``, the wrapped function accepts the complete input
arrays, including both the loop and the core dimensions.
Defaults to `False`, in which case the loop dimensions are
iterated over in the `GUFunc` processing code.
supports_out (bool, optional):
If the wrapped function supports out as one of its kwargs.
Defaults to `False`.
signatures (list of tuple of str):
Contains strings in the form of 'ii->i' with i being the char of a
dtype. Each element of the list is a tuple with the string
and an alternative function to `func` to be executed when the inputs
can be cast as described by this signature.
name (str, optional):
Name for the GUFunc object. If not specified, ``func``'s name
is used.
doc (str, optional):
Docstring for the GUFunc object. If not specified, ``func.__doc__``
is used.
'''
def __init__(self, func, signature, **kwargs):
# We would like to create gufuncs from cupy regular ufuncs
# so we can avoid most of the __call__ stuff
self._func = func
self._signature = signature
self.__name__ = kwargs.pop('name', func.__name__)
self.__doc__ = kwargs.pop('doc', func.__doc__)
# The following are attributes to avoid applying certain steps
# when wrapping cupy functions that do some of the gufunc
# stuff internally due to CUDA libraries requirements
self._supports_batched = kwargs.pop('supports_batched', False)
self._supports_out = kwargs.pop('supports_out', False)
signatures = kwargs.pop('signatures', [])
if kwargs:
raise TypeError(
'got unexpected keyword arguments: '
+ ', '.join([repr(k) for k in kwargs])
)
# Preprocess the signature here
input_coredimss, output_coredimss = _parse_gufunc_signature(
self._signature)
self._input_coredimss = input_coredimss
self._output_coredimss = output_coredimss
# This is pre-calculated to later check the minimum number of
# dimensions required per input
self._min_dims = [0] * len(input_coredimss)
for i, inp in enumerate(input_coredimss):
for d in inp:
if d[-1] != '?':
self._min_dims[i] += 1
# Determine nout: nout = 0 for functions returning a single
# array directly; otherwise the number of outputs in the signature
self._nout = (
0
if not isinstance(output_coredimss, list)
else len(output_coredimss)
)
self._nin = (
0
if not isinstance(input_coredimss, list)
else len(input_coredimss)
)
# Determines the function that will be run depending on the datatypes.
# `signatures` is a list whose elements are either a dtype string in
# the format 'ii->o' or a tuple of such a string and a function other
# than `func` to be executed for those types
self._ops_register = _OpsRegister(
signatures, self._func, self._nin, self._nout, self.__name__)
def _apply_func_to_inputs(self, func, dim, sizes, dims, args, outs):
# Apply function
# The resulting array is loop_output_dims+the specified dims
# Some functions have batching logic inside due to highly
# optimized CUDA libraries so we just call them
if self._supports_batched or dim == len(dims):
# Check if the function supports out, order and other args
if self._supports_out and outs is not None:
outs = outs[0] if len(outs) == 1 else outs
func(*args, out=outs)
else:
fouts = func(*args)
# TODO(ecastill) improve this check
if isinstance(fouts, cupy.ndarray):
fouts = (fouts,)
for o, fo in zip(outs, fouts):
cupy._core.elementwise_copy(fo, o)
else:
dim_size = sizes[dims[dim]][0]
for i in range(dim_size):
n_args = [a[i] for a in args]
n_outs = None
if outs is not None:
n_outs = [o[i] for o in outs]
self._apply_func_to_inputs(
func, dim + 1, sizes, dims, n_args, n_outs)
def _transpose_element(self, arg, iax, shape):
iax = tuple(a if a < 0 else a - len(shape) for a in iax)
tidc = (
tuple(i for i in range(-len(shape), 0) if i not in iax) + iax
)
return arg.transpose(tidc)
def _get_args_transposed(self, args, input_axes, outs, output_axes):
# This code credit goes to Dask
# https://github.com/dask/dask/blob/61b578f5a3ad88cbc6a8b9a73ce08c551bd969fa/dask/array/gufunc.py#L349-L377
# modifications have been done to support arguments broadcast
# out argument, and optional core dims.
transposed_args = []
# This is used when reshaping the outputs so that we can delete
# dims that were not specified in the input
missing_dims = set()
for i, (arg, iax, input_coredims, md) in enumerate(zip(
args, input_axes, self._input_coredimss, self._min_dims)):
shape = arg.shape
nds = len(shape)
# Inputs that have missing (optional) dimensions need to be reshaped
if nds < md:
raise ValueError(f'Input operand {i} does not have enough'
f' dimensions (has {nds}, gufunc core with'
f' signature {self._signature} requires {md})')
optionals = len(input_coredims) - nds
if optionals > 0:
# Look for optional dimensions
# We only allow the first or the last dimensions to be optional
if input_coredims[0][-1] == '?':
shape = (1,) * optionals + shape
missing_dims.update(set(input_coredims[:optionals]))
else:
shape = shape + (1,) * optionals
missing_dims.update(
set(input_coredims[len(input_coredims) - optionals:]))
arg = arg.reshape(shape)
transposed_args.append(self._transpose_element(arg, iax, shape))
args = transposed_args
if outs is not None:
transposed_outs = []
# outs should be transposed to the intermediate form before
# copying all results
for out, iox, coredims in zip(
outs, output_axes, self._output_coredimss):
transposed_outs.append(self._transpose_element(
out, iox, out.shape))
# check that outs has been correctly transposed
# if the function returns a scalar, outs will be ignored
if len(transposed_outs) == len(outs):
outs = transposed_outs
# We can't directly broadcast the arrays together since their core
# dims might differ; only the loop dimensions are broadcastable.
# The number of core dims is taken per argument from the signature.
shape = internal._broadcast_shapes(
[a.shape[:len(a.shape) - len(cd)]
for a, cd in zip(args, self._input_coredimss)])
args = [_manipulation.broadcast_to(
a, shape + a.shape[len(a.shape) - len(cd):])
for a, cd in zip(args, self._input_coredimss)]
# Assess input args for loop dims
input_shapes = [a.shape for a in args]
num_loopdims = [
len(s) - len(cd) for s, cd in zip(
input_shapes, self._input_coredimss)
]
max_loopdims = max(num_loopdims) if num_loopdims else None
core_input_shapes = [
dict(zip(icd, s[n:]))
for s, n, icd in zip(
input_shapes, num_loopdims, self._input_coredimss)
]
core_shapes = {}
for d in core_input_shapes:
core_shapes.update(d)
loop_input_dimss = [
tuple(
'__loopdim%d__' % d for d in range(
max_loopdims - n, max_loopdims)
)
for n in num_loopdims
]
input_dimss = [li + c for li, c in zip(
loop_input_dimss, self._input_coredimss)]
loop_output_dims = max(loop_input_dimss, key=len, default=())
# Assess input args for same size and chunk sizes
# Collect sizes and chunksizes of all dims in all arrays
dimsizess = {}
for dims, shape in zip(input_dimss, input_shapes):
for dim, size in zip(dims, shape):
dimsizes = dimsizess.get(dim, [])
dimsizes.append(size)
dimsizess[dim] = dimsizes
# Assert correct partitioning: all arrays must agree on the size of
# each dimension (size-1 entries broadcast)
for dim, sizes in dimsizess.items():
if set(sizes).union({1}) != {1, max(sizes)}:
raise ValueError(
f'Dimension {dim} has different lengths in arrays'
)
return args, dimsizess, loop_output_dims, outs, missing_dims
def _determine_order(self, args, order):
if order.upper() in ('C', 'K'):
# Order is determined to be C to allocate the out array
# but we will change the strides of the out array
# to be K later in __call__
return 'C'
elif order.upper() == 'A':
# order is F if all arrays are strictly F
order = ('F' if all([a.flags.f_contiguous
and not a.flags.c_contiguous
for a in args]) else 'C')
return order
elif order.upper() == 'F':
return 'F'
else:
raise RuntimeError(f'Unknown order {order}')
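# Illustrative behavior (not part of the original module): 'C' and 'K'
# both allocate the out array in C order here ('K' strides are fixed up
# later in __call__); 'A' returns 'F' only when every input is
# F-contiguous and not C-contiguous, so e.g. 1-D contiguous inputs
# (which are both) still yield 'C'.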
def __call__(self, *args, **kwargs):
'''
Apply a generalized ufunc.
Args:
args: Input arguments. Each of them can be a :class:`cupy.ndarray`
object or a scalar. The output arguments can be omitted or be
specified by the ``out`` argument.
axes (List of tuples of int, optional):
A list of tuples with indices of axes a generalized ufunc
should operate on.
For instance, for a signature of ``'(i,j),(j,k)->(i,k)'``
appropriate for matrix multiplication, the base elements are
two-dimensional matrices and these are taken to be stored in
the two last axes of each argument. The corresponding
axes keyword would be ``[(-2, -1), (-2, -1), (-2, -1)]``.
For simplicity, for generalized ufuncs that operate on
1-dimensional arrays (vectors), a single integer is accepted
instead of a single-element tuple, and for generalized ufuncs
for which all outputs are scalars, the output tuples
can be omitted.
axis (int, optional):
A single axis over which a generalized ufunc should operate.
This is a short-cut for ufuncs that operate over a single,
shared core dimension, equivalent to passing in axes with
entries of (axis,) for each single-core-dimension argument
and ``()`` for all others.
For instance, for a signature ``'(i),(i)->()'``, it is
equivalent to passing in ``axes=[(axis,), (axis,), ()]``.
keepdims (bool, optional):
If this is set to True, axes which are reduced over will be
left in the result as a dimension with size one, so that the
result will broadcast correctly against the inputs. This
option can only be used for generalized ufuncs that operate
on inputs that all have the same number of core dimensions
and with outputs that have no core dimensions, i.e., with
signatures like ``'(i),(i)->()'`` or ``'(m,m)->()'``.
If used, the location of the dimensions in the output can
be controlled with axes and axis.
casting (str, optional):
Provides a policy for what kind of casting is permitted.
Defaults to ``'same_kind'``
dtype (dtype, optional):
Overrides the dtype of the calculation and output arrays.
Similar to signature.
signature (str or tuple of dtype, optional):
Either a data-type, a tuple of data-types, or a special
signature string indicating the input and output types of a
ufunc. This argument allows you to provide a specific
signature for the function to be used if registered in the
``signatures`` kwarg of the ``__init__`` method.
If the loop specified does not exist for the ufunc, then
a TypeError is raised. Normally, a suitable loop is found
automatically by comparing the input types with what is
available and searching for a loop with data-types to
which all inputs can be cast safely. This keyword argument
lets you bypass that search and choose a particular loop.
order (str, optional):
Specifies the memory layout of the output array. Defaults to
``'K'``. ``'C'`` means the output should be C-contiguous,
``'F'`` means F-contiguous, ``'A'`` means F-contiguous
if the inputs are F-contiguous and not also C-contiguous,
C-contiguous otherwise, and ``'K'`` means to match the element
ordering of the inputs as closely as possible.
out (cupy.ndarray): Output array. If not specified, new
arrays are allocated.
Returns:
Output array or a tuple of output arrays.
'''
# This argument cannot be used for generalized ufuncs
# as those take non-scalar input.
# where = kwargs.pop('where', None)
outs = kwargs.pop('out', None)
axes = kwargs.pop('axes', None)
axis = kwargs.pop('axis', None)
order = kwargs.pop('order', 'K')
dtype = kwargs.pop('dtype', None)
keepdims = kwargs.pop('keepdims', False)
signature = kwargs.pop('signature', None)
casting = kwargs.pop('casting', 'same_kind')
if len(kwargs) > 0:
raise RuntimeError(
'Unknown kwargs {}'.format(' '.join(kwargs.keys())))
ret_dtype = None
func = self._func
# this will cast the inputs appropriately
args, ret_dtype, func = self._ops_register.determine_dtype(
args, dtype, casting, signature)
if not isinstance(self._signature, str):
raise TypeError('`signature` has to be of type string')
if outs is not None and type(outs) != tuple:
if isinstance(outs, cupy.ndarray):
outs = (outs,)
else:
raise TypeError('`outs` must be a tuple or `cupy.ndarray`')
filter_order = self._determine_order(args, order)
input_coredimss = self._input_coredimss
output_coredimss = self._output_coredimss
# Axes
input_axes, output_axes = _validate_normalize_axes(
axes, axis, keepdims, input_coredimss, output_coredimss
)
if len(input_coredimss) != len(args):
raise ValueError(
'According to `signature`, `func` requires %d arguments,'
' but %d given' % (len(input_coredimss), len(args)))
args, dimsizess, loop_output_dims, outs, m_dims = self._get_args_transposed( # NOQA
args, input_axes, outs, output_axes)
# The output shape varies depending on optional dims or not
# TODO(ecastill) this only works for one out argument
out_shape = [dimsizess[od][0] for od in loop_output_dims]
if self._nout > 0:
out_shape += [dimsizess[od][0] for od in output_coredimss[0]]
out_shape = tuple(out_shape)
if outs is None:
outs = cupy.empty(out_shape, dtype=ret_dtype, order=filter_order)
if order == 'K':
strides = internal._get_strides_for_order_K(
outs, ret_dtype, out_shape)
outs._set_shape_and_strides(out_shape, strides, True, True)
outs = (outs,)
else:
if outs[0].shape != out_shape:
raise ValueError(f'Invalid shape for out {outs[0].shape}'
f' needs {out_shape}')
_raise_if_invalid_cast(
ret_dtype, outs[0].dtype, casting, "out dtype")
self._apply_func_to_inputs(
func, 0, dimsizess, loop_output_dims, args, outs)
# This code credit goes to Dask
# https://github.com/dask/dask/blob/61b578f5a3ad88cbc6a8b9a73ce08c551bd969fa/dask/array/gufunc.py#L462-L503
# Treat direct output
if self._nout == 0:
output_coredimss = [output_coredimss]
# Split output
# tmp might be a tuple of outs
# we changed the way we apply the function compared to dask
# we have added support for optional dims
leaf_arrs = []
for tmp in outs:
for ocd, oax in zip(output_coredimss, output_axes):
leaf_arr = tmp
# Axes:
if keepdims:
slices = (len(leaf_arr.shape) * (slice(None),)
+ len(oax) * (numpy.newaxis,))
leaf_arr = leaf_arr[slices]
tidcs = [None] * len(leaf_arr.shape)
for i, oa in zip(range(-len(oax), 0), oax):
tidcs[oa] = i
j = 0
for i in range(len(tidcs)):
if tidcs[i] is None:
tidcs[i] = j
j += 1
leaf_arr = leaf_arr.transpose(tidcs)
# Delete the dims that were optionals after the input expansion
if len(m_dims) > 0:
shape = leaf_arr.shape
# This line deletes the dimensions that were not present
# in the input
core_shape = shape[-len(ocd):]
core_shape = tuple([
d for d, n in zip(core_shape, ocd) if n not in m_dims])
shape = shape[:-len(ocd)] + core_shape
leaf_arr = leaf_arr.reshape(shape)
# leaf_arrs.append(leaf_arr.astype(leaf_arr.dtype, order=order)) # NOQA
leaf_arrs.append(leaf_arr)
return tuple(leaf_arrs) if self._nout > 1 else leaf_arrs[0]
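# Illustrative usage sketch (not part of the original module).
# `cupy.matmul` is only a stand-in for a kernel that consumes the core
# dimensions; `supports_batched`/`supports_out` reflect that it already
# handles batching and an `out=` argument internally:
#
#     >>> gu_matmul = _GUFunc(
#     ...     cupy.matmul, '(i,j),(j,k)->(i,k)',
#     ...     supports_batched=True, supports_out=True)
#     >>> a = cupy.ones((4, 2, 3))
#     >>> b = cupy.ones((3, 5))
#     >>> gu_matmul(a, b).shape  # loop dim 4 broadcasts against b
#     (4, 2, 5)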
from libcpp cimport vector
from cupy._core cimport _carray
from cupy._core cimport _scalar
from cupy._core._carray cimport shape_t
from cupy._core.core cimport _ndarray_base
from cupy.cuda cimport memory
from cupy.cuda cimport texture
cdef class ParameterInfo:
cdef:
readonly str name
readonly object dtype
readonly str ctype
readonly bint raw
readonly bint is_const
cdef enum _ArgKind:
ARG_KIND_NDARRAY = 1
ARG_KIND_INDEXER
ARG_KIND_SCALAR
ARG_KIND_POINTER
ARG_KIND_TEXTURE
cdef class _ArgInfo:
# Holds metadata of an argument.
# This class is immutable and used as a part of hash keys.
cdef:
readonly _ArgKind arg_kind
readonly type type
readonly object dtype
readonly int ndim
readonly bint c_contiguous
readonly bint index_32_bits
cdef _ArgInfo _init(
self,
_ArgKind arg_kind,
type typ,
object dtype,
int ndim,
bint c_contiguous,
bint index_32_bits)
@staticmethod
cdef _ArgInfo from_arg(object arg)
@staticmethod
cdef _ArgInfo from_ndarray(_ndarray_base arg)
@staticmethod
cdef _ArgInfo from_scalar(_scalar.CScalar arg)
@staticmethod
cdef _ArgInfo from_indexer(_carray.Indexer arg)
@staticmethod
cdef _ArgInfo from_memptr(memory.MemoryPointer arg)
@staticmethod
cdef _ArgInfo from_texture(texture.TextureObject arg)
cdef _ArgInfo as_ndarray_with_ndim(self, int ndim)
cdef bint is_ndarray(self)
cdef bint is_scalar(self)
cdef str get_c_type(self)
cdef str get_param_c_type(self, ParameterInfo p)
cdef str get_c_var_name(self, ParameterInfo p)
cdef class _TypeMap:
# Typedef mapping between C types.
# This class is immutable.
cdef:
tuple _pairs
cdef str get_typedef_code(self)
cdef class _Op:
"""Simple data structure that represents a kernel routine with single \
concrete dtype mapping.
"""
cdef:
readonly tuple in_types
readonly tuple out_types
readonly int nin
readonly int nout
readonly object routine
# If the type combination specified by in_types and out_types is
# disallowed, error_func must be set instead of routine.
# It's called by check_valid() method.
readonly object error_func
@staticmethod
cdef _Op _from_type_and_routine_or_error_func(
str typ, object routine, object error_func)
# Creates an op instance parsing a dtype mapping.
@staticmethod
cdef _Op from_type_and_routine(str typ, routine)
cpdef tuple get_in_dtypes(self)
cpdef tuple get_out_dtypes(self)
# Creates an op instance parsing a dtype mapping with given error function.
@staticmethod
cdef _Op from_type_and_error_func(str typ, error_func)
# Raises an error if error_func is given.
cdef check_valid(self)
cdef class _Ops:
"""A kernel routine representation with various dtype mappings.
"""
cdef:
readonly tuple ops
readonly int nin
readonly int nout
@staticmethod
cdef _Ops from_tuples(object ops, routine)
# Queries a single op from input arguments.
cpdef _Op guess_routine(
self, str name, dict cache, list in_args, dtype, _Ops out_ops)
cpdef _Op _guess_routine_from_in_types(
self, tuple in_types, object can_cast=*)
cpdef _Op _guess_routine_from_dtype(self, object dtype)
cpdef create_ufunc(name, ops, routine=*, preamble=*, doc=*,
default_casting=*, loop_prep=*, out_ops=*,
cutensor_op=*, scatter_op=*)
cdef tuple _get_arginfos(list args)
cdef str _get_kernel_params(tuple params, tuple arginfos)
cdef list _broadcast(list args, tuple params, bint use_size, shape_t& shape)
cdef list _get_out_args_from_optionals(
subtype, list out_args, tuple out_types, const shape_t& out_shape, casting,
obj)
cdef list _get_out_args_with_params(
list out_args, tuple out_types,
const shape_t& out_shape, tuple out_params, bint is_size_specified)
cdef _check_peer_access(_ndarray_base arr, int device_id)
cdef list _preprocess_args(int dev_id, args, bint use_c_scalar)
cdef shape_t _reduce_dims(list args, tuple params, const shape_t& shape)
import string
import warnings
import numpy
import cupy
from cupy.cuda import compiler
from cupy import _util
cimport cython # NOQA
from libcpp cimport vector
from cupy.cuda cimport device
from cupy.cuda cimport function
from cupy.cuda cimport memory
from cupy.cuda cimport texture
from cupy._core cimport _accelerator
from cupy._core cimport _carray
from cupy._core cimport _scalar
from cupy._core._dtype cimport get_dtype, _raise_if_invalid_cast
from cupy._core._memory_range cimport may_share_bounds
from cupy._core._scalar import get_typename as _get_typename
from cupy._core cimport core
from cupy._core.core cimport _convert_object_with_cuda_array_interface
from cupy._core.core cimport _ndarray_init
from cupy._core.core cimport compile_with_cache
from cupy._core.core cimport _ndarray_base
from cupy._core cimport internal
from cupy_backends.cuda.api cimport runtime
try:
import cupy_backends.cuda.libs.cutensor as cuda_cutensor
except ImportError:
cuda_cutensor = None
from cupy._core import _fusion_thread_local
cdef inline bint _contains_zero(const shape_t& v) except? -1:
for i in range(v.size()):
if v[i] == 0:
return True
return False
@_util.memoize(for_each_device=True)
def _get_warpsize():
device_id = runtime.getDevice()
return runtime.getDeviceProperties(device_id)['warpSize']
cdef str _get_simple_elementwise_kernel_code(
tuple params, tuple arginfos, str operation, str name,
_TypeMap type_map, str preamble, str loop_prep='', str after_loop=''):
# Disable loop unrolling to avoid 64-bit division
module_code = string.Template('''
${typedef_preamble}
${preamble}
extern "C" __global__ void ${name}(${params}) {
${loop_prep};
#pragma unroll 1
CUPY_FOR(i, _ind.size()) {
_ind.set(i);
${operation};
}
${after_loop};
}
''').substitute(
typedef_preamble=type_map.get_typedef_code(),
params=_get_kernel_params(params, arginfos),
operation=operation,
name=name,
preamble=preamble,
loop_prep=loop_prep,
after_loop=after_loop)
return module_code
cdef function.Function _get_simple_elementwise_kernel_from_code(
str name, str code, tuple options=()):
module = compile_with_cache(code, options)
return module.get_function(name)
cdef function.Function _get_simple_elementwise_kernel(
tuple params, tuple arginfos, str operation, str name,
_TypeMap type_map, str preamble, str loop_prep='', str after_loop='',
tuple options=()):
code = _get_simple_elementwise_kernel_code(
params, arginfos, operation, name, type_map, preamble, loop_prep,
after_loop
)
return _get_simple_elementwise_kernel_from_code(name, code, options)
cdef inline int _get_kind_score(int kind):
if b'b' == kind:
return 0
if b'u' == kind or b'i' == kind:
return 1
if b'f' == kind or b'c' == kind:
return 2
return -1
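# Illustrative mapping (not part of the original module): the score
# orders dtype kinds for type-promotion decisions:
#     _get_kind_score(ord('b')) == 0   # bool
#     _get_kind_score(ord('i')) == 1   # signed/unsigned integers
#     _get_kind_score(ord('f')) == 2   # floats and complex
# and any other kind character yields -1.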
@cython.profile(False)
cdef inline _check_peer_access(_ndarray_base arr, int device_id):
if arr.data.device_id == device_id:
return
msg = (
f'The device where the array resides ({arr.data.device_id}) is '
f'different from the current device ({device_id}).'
)
cdef bint peer_access = device._enable_peer_access(
device_id, arr.data.device_id)
if not peer_access:
raise ValueError(
f'{msg} Peer access is unavailable between these devices.')
warnings.warn(
f'{msg} Peer access has been activated automatically.',
_util.PerformanceWarning)
cdef inline _preprocess_arg(int dev_id, arg, bint use_c_scalar):
if isinstance(arg, _ndarray_base):
s = arg
_check_peer_access(<_ndarray_base>s, dev_id)
elif isinstance(arg, texture.TextureObject):
s = arg
elif hasattr(arg, '__cuda_array_interface__'):
s = _convert_object_with_cuda_array_interface(arg)
_check_peer_access(<_ndarray_base>s, dev_id)
elif hasattr(arg, '__cupy_get_ndarray__'):
s = arg.__cupy_get_ndarray__()
_check_peer_access(<_ndarray_base>s, dev_id)
else: # scalars or invalid args
if use_c_scalar:
s = _scalar.scalar_to_c_scalar(arg)
else:
s = _scalar.scalar_to_numpy_scalar(arg)
if s is None:
raise TypeError('Unsupported type %s' % type(arg))
return s
cdef list _preprocess_args(int dev_id, args, bint use_c_scalar):
"""Preprocesses arguments for kernel invocation
- Checks device compatibility for ndarrays
- Converts Python/NumPy scalars:
- If use_c_scalar is True, into CScalars.
- If use_c_scalar is False, into NumPy scalars.
"""
cdef list ret = []
for arg in args:
ret.append(_preprocess_arg(dev_id, arg, use_c_scalar))
return ret
cdef list _preprocess_optional_args(int dev_id, args, bint use_c_scalar):
"""Preprocesses arguments for kernel invocation
- Checks device compatibility for ndarrays
- Converts Python/NumPy scalars:
- If use_c_scalar is True, into CScalars.
- If use_c_scalar is False, into NumPy scalars.
"""
cdef list ret = []
for arg in args:
if arg is None:
ret.append(None)
else:
ret.append(_preprocess_arg(dev_id, arg, use_c_scalar))
return ret
cdef class _ArgInfo:
# Holds metadata of an argument.
# This class is immutable and used as a part of hash keys.
def __init__(self, *args):
arg_kind, typ, dtype, ndim, c_contiguous, index_32_bits = args
self._init(arg_kind, typ, dtype, ndim, c_contiguous, index_32_bits)
cdef _ArgInfo _init(
self,
_ArgKind arg_kind,
type typ,
object dtype,
int ndim,
bint c_contiguous,
bint index_32_bits):
self.arg_kind = arg_kind
self.type = typ
self.dtype = dtype
self.ndim = ndim
self.c_contiguous = c_contiguous
self.index_32_bits = index_32_bits
@staticmethod
cdef _ArgInfo from_arg(object arg):
typ = type(arg)
if issubclass(typ, _ndarray_base):
return _ArgInfo.from_ndarray(arg)
if typ is _scalar.CScalar:
return _ArgInfo.from_scalar(arg)
if typ is _carray.Indexer:
return _ArgInfo.from_indexer(arg)
if typ is memory.MemoryPointer:
return _ArgInfo.from_memptr(arg)
if typ is texture.TextureObject:
return _ArgInfo.from_texture(arg)
assert False, typ
@staticmethod
cdef _ArgInfo from_ndarray(_ndarray_base arg):
cdef _ArgInfo ret = _ArgInfo.__new__(_ArgInfo)
ret._init(
ARG_KIND_NDARRAY,
type(arg),
arg.dtype.type,
arg._shape.size(),
arg._c_contiguous,
arg._index_32_bits)
return ret
@staticmethod
cdef _ArgInfo from_scalar(_scalar.CScalar arg):
cdef _ArgInfo ret = _ArgInfo.__new__(_ArgInfo)
dtype = arg.get_numpy_type()
ret._init(ARG_KIND_SCALAR, _scalar.CScalar, dtype, 0, True, True)
return ret
@staticmethod
cdef _ArgInfo from_indexer(_carray.Indexer arg):
cdef _ArgInfo ret = _ArgInfo.__new__(_ArgInfo)
ret._init(
ARG_KIND_INDEXER, _carray.Indexer, None, arg.ndim, True,
arg._index_32_bits)
return ret
@staticmethod
cdef _ArgInfo from_memptr(memory.MemoryPointer arg):
cdef _ArgInfo ret = _ArgInfo.__new__(_ArgInfo)
ret._init(
ARG_KIND_POINTER, memory.MemoryPointer, None, 0, True, True)
return ret
@staticmethod
cdef _ArgInfo from_texture(texture.TextureObject arg):
cdef _ArgInfo ret = _ArgInfo.__new__(_ArgInfo)
ret._init(
ARG_KIND_TEXTURE, texture.TextureObject, None, 0, True, True)
return ret
def __hash__(self):
return hash((self.arg_kind, self.type, self.dtype, self.ndim,
self.c_contiguous, self.index_32_bits))
def __eq__(self, other):
cdef _ArgInfo oth
if not isinstance(other, _ArgInfo):
return False
oth = other
return (
self.arg_kind == oth.arg_kind
and self.type is oth.type
and self.dtype == oth.dtype
and self.ndim == oth.ndim
and self.c_contiguous == oth.c_contiguous
and self.index_32_bits == oth.index_32_bits)
def __repr__(self):
return '<_ArgInfo({})>'.format(
' '.join([
'arg_kind={!r}'.format(self.arg_kind),
'type={!r}'.format(self.type),
'dtype={!r}'.format(self.dtype),
'ndim={!r}'.format(self.ndim),
'c_contiguous={!r}'.format(self.c_contiguous),
'index_32_bits={!r}'.format(self.index_32_bits),
]))
cdef _ArgInfo as_ndarray_with_ndim(self, int ndim):
# Returns an ndarray _ArgInfo with altered ndim.
# If ndim is the same, self is returned untouched.
assert self.arg_kind == ARG_KIND_NDARRAY
if self.ndim == ndim:
return self
return _ArgInfo(
ARG_KIND_NDARRAY, self.type, self.dtype, ndim, False, False)
cdef bint is_ndarray(self):
return self.arg_kind == ARG_KIND_NDARRAY
cdef bint is_scalar(self):
return self.arg_kind == ARG_KIND_SCALAR
cdef str get_c_type(self):
# Returns the C type representation.
if self.arg_kind == ARG_KIND_NDARRAY:
return 'CArray<%s, %d, %d, %d>' % (
_get_typename(self.dtype), self.ndim,
self.c_contiguous, self.index_32_bits)
if self.arg_kind == ARG_KIND_SCALAR:
return _get_typename(self.dtype)
if self.arg_kind == ARG_KIND_INDEXER:
return 'CIndexer<%d, %d>' % (self.ndim, self.index_32_bits)
if self.arg_kind == ARG_KIND_TEXTURE:
return 'cudaTextureObject_t'
assert False
cdef str get_param_c_type(self, ParameterInfo p):
# Returns the C type representation in the global function's
# parameter list.
cdef str ctyp = self.get_c_type()
if p.is_const:
return 'const ' + ctyp
return ctyp
cdef str get_c_var_name(self, ParameterInfo p):
if self.arg_kind in (ARG_KIND_NDARRAY, ARG_KIND_POINTER) and not p.raw:
return '_raw_' + p.name
return p.name
cdef tuple _get_arginfos(list args):
return tuple([_ArgInfo.from_arg(a) for a in args])
cdef str _get_kernel_params(tuple params, tuple arginfos):
cdef ParameterInfo p
cdef _ArgInfo arginfo
assert len(params) == len(arginfos)
lst = []
for i in range(len(params)):
p = params[i]
arginfo = arginfos[i]
lst.append('{} {}'.format(
arginfo.get_param_c_type(p),
arginfo.get_c_var_name(p)))
return ', '.join(lst)
cdef shape_t _reduce_dims(list args, tuple params, const shape_t& shape):
""" Remove contiguous stride to optimize CUDA kernel."""
cdef _ndarray_base arr
if shape.size() <= 1 or len(args) == 0:
return shape
if len(args) == 1: # fast path for reduction
a = args[0]
if (<ParameterInfo>params[0]).raw or not isinstance(a, _ndarray_base):
return shape
arr = a
arr = arr.reduced_view()
if arr is a:
return shape
else:
args[0] = arr
return arr._shape
return _reduced_view_core(args, params, shape)
cdef shape_t _reduced_view_core(list args, tuple params, const shape_t& shape):
cdef int i, ax, last_ax, ndim
cdef Py_ssize_t total_size
cdef shape_t vecshape, newshape, newstrides
cdef vector.vector[int] array_indexes, axes
cdef vector.vector[int] strides_indexes
cdef ParameterInfo p
cdef _ndarray_base arr
ndim = shape.size()
array_indexes.reserve(len(args))
strides_indexes.reserve(len(args))
for i in range(len(args)):
p = params[i]
if p.raw:
continue
a = args[i]
if isinstance(a, _ndarray_base):
array_indexes.push_back(i)
arr = a
if not arr._c_contiguous:
if ndim == 2: # short cut
return shape
strides_indexes.push_back(i)
if array_indexes.size() == 0:
return shape
if strides_indexes.size() == 0:
# The input arrays are all c_contiguous
i = array_indexes[0]
arr = args[i]
total_size = arr.size
newshape.assign(<Py_ssize_t>1, total_size)
newstrides.resize(1)
for i in array_indexes:
arr = args[i]
newstrides[0] = arr.dtype.itemsize
# TODO(niboshi): Confirm update_x_contiguity flags
args[i] = arr._view(
type(arr), newshape, newstrides, False, True, arr)
return newshape
axes.reserve(ndim)
vecshape.reserve(ndim)
for ax in range(ndim):
vecshape.push_back(shape[ax])
last_ax = -1
for ax in range(ndim):
if vecshape[ax] == 1:
continue
if last_ax < 0:
last_ax = ax
continue
for i in strides_indexes:
arr = args[i]
if arr._strides[ax] * vecshape[ax] != arr._strides[last_ax]:
axes.push_back(last_ax)
break
else:
vecshape[ax] *= vecshape[last_ax]
last_ax = ax
if last_ax >= 0:
axes.push_back(last_ax)
if <int>axes.size() == ndim:
return shape
newshape.reserve(axes.size())
newstrides.reserve(axes.size())
for ax in axes:
newshape.push_back(vecshape[ax])
for i in array_indexes:
arr = args[i]
newstrides.clear()
for ax in axes:
newstrides.push_back(arr._strides[ax])
# TODO(niboshi): Confirm update_x_contiguity flags
args[i] = arr._view(type(arr), newshape, newstrides, False, True, arr)
return newshape
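# Illustrative effect (not part of the original module, strides assumed
# for a C-contiguous float32 input): a single (4, 5) array bound to a
# non-raw parameter is collapsed by _reduce_dims into a (20,) view, so
# the kernel indexes a flat array instead of computing 2-D offsets.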
cdef class ParameterInfo:
def __init__(self, str param, bint is_const):
self.name = None
self.dtype = None
self.ctype = None
self.raw = False
self.is_const = is_const
s = tuple([i for i in param.split() if len(i) != 0])
if len(s) < 2:
raise Exception('Syntax error: %s' % param)
t, self.name = s[-2:]
if t == 'CIndexer':
pass
elif len(t) == 1:
self.ctype = t
else:
dtype = get_dtype(t)
self.dtype = dtype.type
if dtype.name != t:
raise ValueError('Wrong type %s' % t)
self.ctype = _get_typename(self.dtype)
for i in s[:-2]:
if i == 'raw':
self.raw = True
elif i == '_non_const':
self.is_const = False
else:
raise Exception('Unknown keyword "%s"' % i)
def __hash__(self):
return hash((
self.name, self.dtype, self.ctype, self.raw, self.is_const))
def __eq__(self, other):
cdef ParameterInfo oth
if not isinstance(other, ParameterInfo):
return False
oth = other
return (
self.name == oth.name
and self.dtype == oth.dtype
and self.ctype == oth.ctype
and self.raw == oth.raw
and self.is_const == oth.is_const)
def __repr__(self):
return '<ParameterInfo({})>'.format(
' '.join([
'name={!r}'.format(self.name),
'dtype={!r}'.format(self.dtype),
'ctype={!r}'.format(self.ctype),
'raw={!r}'.format(self.raw),
'is_const={!r}'.format(self.is_const),
]))
@_util.memoize()
def _get_param_info(str s, is_const):
if len(s) == 0:
return ()
return tuple([ParameterInfo(i, is_const) for i in s.strip().split(',')])
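# Illustrative example (not part of the original module):
# _get_param_info('float32 x, raw T y', True) yields two ParameterInfo
# objects: 'x' with dtype numpy.float32 and ctype 'float', and the raw
# parameter 'y' with the type placeholder 'T'.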
@_util.memoize()
def _decide_params_type(in_params, out_params, in_args_dtype, out_args_dtype):
return _decide_params_type_core(in_params, out_params, in_args_dtype,
out_args_dtype)
cdef class _TypeMap:
def __init__(self, pairs):
self._pairs = pairs
def __hash__(self):
return hash(self._pairs)
def __eq__(self, other):
if not isinstance(other, _TypeMap):
return False
return self._pairs == (<_TypeMap>other)._pairs
def __str__(self):
return '<_TypeMap {}>'.format(self._pairs)
cdef str get_typedef_code(self):
# Returns a code fragment of typedef statements used as preamble.
return ''.join([
'typedef %s %s;\n' % (_get_typename(ctype2), ctype1)
for ctype1, ctype2 in self._pairs])
cdef tuple _decide_params_type_core(
tuple in_params, tuple out_params, tuple in_args_dtype,
tuple out_args_dtype):
type_dict = {}
if out_args_dtype:
assert len(out_params) == len(out_args_dtype)
for p, a in zip(out_params, out_args_dtype):
if a is None:
raise TypeError('Output arguments must be cupy.ndarray')
if p.dtype is not None:
if get_dtype(a) != get_dtype(p.dtype):
raise TypeError(
'Type is mismatched. %s %s %s' % (p.name, a, p.dtype))
elif p.ctype in type_dict:
t = type_dict[p.ctype]
if get_dtype(t) != get_dtype(a):
raise TypeError(
'Type is mismatched. %s %s %s %s' % (
p.name, a, t, p.ctype))
else:
type_dict[p.ctype] = a
assert len(in_params) == len(in_args_dtype)
unknown_ctype = [] # TODO(leofang): remove this as it's unused?
for p, a in zip(in_params, in_args_dtype):
if a is None:
if p.dtype is None:
unknown_ctype.append(p.ctype)
else:
if p.dtype is not None:
if numpy.dtype(a) != numpy.dtype(p.dtype):
raise TypeError(
'Type is mismatched. %s %s %s' % (p.name, a, p.dtype))
elif p.ctype in type_dict:
t = type_dict[p.ctype]
if numpy.dtype(t) != numpy.dtype(a):
raise TypeError(
'Type is mismatched. %s %s %s %s' % (
p.name, a, t, p.ctype))
else:
type_dict[p.ctype] = a
in_types = tuple([type_dict[p.ctype] if p.dtype is None else p.dtype
for p in in_params])
out_types = tuple([type_dict[p.ctype] if p.dtype is None else p.dtype
for p in out_params])
type_map = _TypeMap(tuple(sorted(type_dict.items())))
return in_types, out_types, type_map
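# Illustrative example (not part of the original module): for in_params
# parsed from 'T x, T y' and two float32 input dtypes, the placeholder
# 'T' is bound once, so in_types and out_types all resolve to float32
# and the resulting _TypeMap emits a single 'typedef float T;' line;
# passing float32 and float64 for the same 'T' raises the
# 'Type is mismatched' TypeError instead.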
cdef list _broadcast(list args, tuple params, bint use_size, shape_t& shape):
# `shape` is an output argument
cdef Py_ssize_t i
cdef ParameterInfo p
cdef bint any_nonraw_array = False
# Collect non-raw arrays
value = []
for i, a in enumerate(args):
p = params[i]
if not p.raw and isinstance(a, _ndarray_base):
# Non-raw array
any_nonraw_array = True
value.append(a)
else:
value.append(None)
if use_size:
if any_nonraw_array:
raise ValueError('Specified \'size\' can be used only '
'if all of the ndarrays are \'raw\'.')
else:
if not any_nonraw_array:
raise ValueError('Loop size is undecided.')
# Perform broadcast.
# Note that arrays in `value` are replaced with broadcasted ones.
internal._broadcast_core(value, shape)
# Restore raw arrays and scalars from the original list.
for i, a in enumerate(value):
if a is None:
value[i] = args[i]
return value
cdef _numpy_can_cast = numpy.can_cast
cdef list _get_out_args_from_optionals(
subtype, list out_args, tuple out_types, const shape_t& out_shape, casting,
obj
):
cdef _ndarray_base arr
while len(out_args) < len(out_types):
out_args.append(None)
for i, a in enumerate(out_args):
if a is None:
out_args[i] = _ndarray_init(
subtype, out_shape, out_types[i], obj)
continue
if not isinstance(a, _ndarray_base):
raise TypeError(
'Output arguments type must be cupy.ndarray')
arr = a
if not internal.vector_equal(arr._shape, out_shape):
raise ValueError('Out shape is mismatched')
out_type = get_dtype(out_types[i])
_raise_if_invalid_cast(out_type, arr.dtype, casting, "output operand")
return out_args
cdef _copy_in_args_if_needed(list in_args, list out_args):
# `in_args` is an input and output argument
cdef _ndarray_base inp, out
for i in range(len(in_args)):
a = in_args[i]
if isinstance(a, _ndarray_base):
inp = a
for out in out_args:
if inp is not out and may_share_bounds(inp, out):
in_args[i] = inp.copy()
break
cdef list _get_out_args_with_params(
list out_args, tuple out_types, const shape_t& out_shape,
tuple out_params, bint is_size_specified):
cdef ParameterInfo p
cdef _ndarray_base arr
if not out_args:
for p in out_params:
if p.raw and not is_size_specified:
raise ValueError('Output array size is undecided')
return [_ndarray_init(
cupy.ndarray, out_shape, t, None) for t in out_types]
for i, p in enumerate(out_params):
a = out_args[i]
if not isinstance(a, _ndarray_base):
raise TypeError(
'Output arguments type must be cupy.ndarray')
arr = a
if not p.raw and not internal.vector_equal(arr._shape, out_shape):
raise ValueError('Out shape is mismatched')
return out_args
@_util.memoize()
def _get_elementwise_kernel_code(
tuple arginfos, _TypeMap type_map,
tuple params, str operation, str name,
str preamble, str loop_prep='', str after_loop='', tuple options=()):
cdef _ArgInfo arginfo
op = []
for p, arginfo in zip(params, arginfos):
if arginfo.is_ndarray() and not p.raw:
if p.is_const:
fmt = 'const {t} &{n} = _raw_{n}[_ind.get()];'
else:
fmt = '{t} &{n} = _raw_{n}[_ind.get()];'
op.append(fmt.format(t=p.ctype, n=p.name))
op.append(operation)
operation = '\n'.join(op)
return _get_simple_elementwise_kernel_code(
params, arginfos, operation, name, type_map,
preamble, loop_prep, after_loop)
@_util.memoize(for_each_device=True)
def _get_elementwise_kernel(
tuple arginfos, _TypeMap type_map,
tuple params, str operation, str name,
str preamble, str loop_prep='', str after_loop='', tuple options=()):
cdef str code = _get_elementwise_kernel_code(
arginfos, type_map, params, operation, name, preamble, loop_prep,
after_loop
)
return _get_simple_elementwise_kernel_from_code(name, code, options)
cdef class ElementwiseKernel:
"""User-defined elementwise kernel.
This class can be used to define an elementwise kernel with or without
broadcasting.
The kernel is compiled at an invocation of the
:meth:`~ElementwiseKernel.__call__` method,
which is cached for each device.
The compiled binary is also cached into a file under the
``$HOME/.cupy/kernel_cache/`` directory with a hashed file name. The cached
binary is reused by other processes.
Args:
in_params (str): Input argument list.
out_params (str): Output argument list.
operation (str): The body in the loop written in CUDA-C/C++.
name (str): Name of the kernel function. It should be set for
readability of the performance profiling.
reduce_dims (bool): If ``False``, the shapes of array arguments are
kept within the kernel invocation. The shapes are reduced
(i.e., the arrays are reshaped without copy to the minimum
dimension) by default. It may make the kernel faster by reducing the
index calculations.
options (tuple): Compile options passed to NVRTC. For details, see
https://docs.nvidia.com/cuda/nvrtc/index.html#group__options.
preamble (str): Fragment of the CUDA-C/C++ code that is inserted at the
top of the cu file.
no_return (bool): If ``True``, __call__ returns ``None``.
return_tuple (bool): If ``True``, __call__ always returns tuple of
array even if single value is returned.
loop_prep (str): Fragment of the CUDA-C/C++ code that is inserted at
the top of the kernel function definition and above the ``for``
loop.
after_loop (str): Fragment of the CUDA-C/C++ code that is inserted at
the bottom of the kernel function definition.
"""
cdef:
readonly tuple in_params
readonly tuple out_params
readonly Py_ssize_t nin
readonly Py_ssize_t nout
readonly Py_ssize_t nargs
readonly tuple params
readonly object operation
readonly str name
readonly str __name__
readonly bint reduce_dims
readonly object preamble
readonly bint no_return
readonly bint return_tuple
readonly dict kwargs
readonly dict _params_type_memo
readonly dict _elementwise_kernel_memo
readonly dict _cached_codes
def __init__(self, in_params, out_params, operation,
name='kernel', reduce_dims=True, preamble='',
no_return=False, return_tuple=False, **kwargs):
if not compiler.is_valid_kernel_name(name):
raise ValueError(
'Invalid kernel name: "%s"' % name)
self.in_params = _get_param_info(in_params, True)
self.out_params = _get_param_info(out_params, False)
self.nin = len(self.in_params)
self.nout = len(self.out_params)
self.nargs = self.nin + self.nout
param_rest = _get_param_info('CIndexer _ind', False)
self.params = self.in_params + self.out_params + param_rest
self.operation = operation
self.name = name
self.reduce_dims = reduce_dims
self.preamble = preamble
self.no_return = no_return
self.return_tuple = return_tuple
self.kwargs = kwargs
self._params_type_memo = {}
self._cached_codes = {}
names = [p.name for p in self.in_params + self.out_params]
if 'i' in names:
raise ValueError('Can not use \'i\' as a parameter name')
self._elementwise_kernel_memo = {}
# This is for profiling mechanisms to auto infer a name
self.__name__ = name
def __call__(self, *args, **kwargs):
"""Compiles and invokes the elementwise kernel.
The compilation runs only if the kernel is not cached. Note that
kernels with different argument dtypes or dimensions are not
compatible. It means that a single ElementwiseKernel object may be
compiled into multiple kernel binaries.
Args:
args: Arguments of the kernel.
size (int): Range size of the indices. By default, the range size
is automatically determined from the result of broadcasting.
This parameter must be specified if and only if all ndarrays
are `raw` and the range size cannot be determined
automatically.
block_size (int): Number of threads per block. By default, the
value is set to 128.
Returns:
If ``no_return`` has not been set, arrays are returned according to
the ``out_params`` argument of the ``__init__`` method.
If ``no_return`` has been set, ``None`` is returned.
"""
cdef function.Function kern
cdef Py_ssize_t size, i
cdef list in_args, out_args
cdef tuple in_types, out_types
cdef shape_t shape
size = kwargs.pop('size', -1)
stream = kwargs.pop('stream', None)
block_size = kwargs.pop('block_size', 128)
if len(kwargs):
raise TypeError('Wrong arguments %s' % kwargs)
if block_size <= 0:
raise ValueError('block_size must be greater than zero')
n_args = len(args)
if n_args != self.nin and n_args != self.nargs:
raise TypeError(
'Wrong number of arguments for {!r}. '
'It must be either {} or {} (with outputs), '
'but given {}.'.format(
self.name, self.nin, self.nargs, n_args))
for arg in args:
if hasattr(arg, '__cupy_override_elementwise_kernel__'):
return arg.__cupy_override_elementwise_kernel__(
self, *args, **kwargs)
dev_id = device.get_device_id()
arg_list = _preprocess_args(dev_id, args, True)
out_args = arg_list[self.nin:]
# _broadcast updates shape
in_args = _broadcast(
arg_list, self.params, size != -1, shape)[:self.nin]
in_ndarray_types = []
for a in in_args:
if isinstance(a, _ndarray_base):
t = a.dtype.type
elif isinstance(a, texture.TextureObject):
t = 'cudaTextureObject_t'
else:
t = None
in_ndarray_types.append(t)
in_ndarray_types = tuple(in_ndarray_types)
out_ndarray_types = tuple([a.dtype.type for a in out_args])
in_types, out_types, type_map = self._decide_params_type(
in_ndarray_types, out_ndarray_types)
is_size_specified = False
if size != -1:
shape.assign(1, size)
is_size_specified = True
out_args = _get_out_args_with_params(
out_args, out_types, shape, self.out_params, is_size_specified)
if self.no_return:
ret = None
elif not self.return_tuple and self.nout == 1:
ret = out_args[0]
else:
ret = tuple(out_args)
if _contains_zero(shape):
return ret
for i, x in enumerate(in_args):
if type(x) is _scalar.CScalar:
(<_scalar.CScalar>x).apply_dtype(in_types[i])
inout_args = in_args + out_args
if self.reduce_dims:
shape = _reduce_dims(inout_args, self.params, shape)
indexer = _carray._indexer_init(shape)
inout_args.append(indexer)
arginfos = _get_arginfos(inout_args)
kern = self._get_elementwise_kernel(dev_id, arginfos, type_map)
kern.linear_launch(indexer.size, inout_args, shared_mem=0,
block_max_size=block_size, stream=stream)
return ret
cpdef tuple _decide_params_type(
self, tuple in_args_dtype, tuple out_args_dtype):
key = (in_args_dtype, out_args_dtype)
ret = self._params_type_memo.get(key, None)
if ret is not None:
return ret
ret = _decide_params_type_core(
self.in_params, self.out_params, in_args_dtype, out_args_dtype)
self._params_type_memo[key] = ret
return ret
cpdef function.Function _get_elementwise_kernel(
self, int dev_id, tuple arginfos, _TypeMap type_map):
key = (
dev_id,
arginfos,
type_map)
kern = self._elementwise_kernel_memo.get(key, None)
if kern is not None:
return kern
kern = _get_elementwise_kernel(
arginfos, type_map, self.params, self.operation,
self.name, self.preamble, **self.kwargs)
# Store the compiled kernel in the cache.
# Potentially overwrite a duplicate cache entry because
# _get_elementwise_kernel() may include IO wait.
in_types = []
for x in arginfos:
if x.type is cupy.ndarray:
in_types.append(cupy.dtype(x.dtype).char)
in_types = tuple(in_types)
if in_types not in self._cached_codes:
code = _get_elementwise_kernel_code(
arginfos, type_map, self.params, self.operation,
self.name, self.preamble, **self.kwargs)
self._cached_codes[in_types] = code
self._elementwise_kernel_memo[key] = kern
return kern
@property
def cached_codes(self):
"""Returns a dict that has input types as keys and codes values.
This proprety method is for debugging purpose.
The return value is not guaranteed to keep backward compatibility.
"""
if len(self._cached_codes) == 0:
warnings.warn(
'No codes are cached because compilation is deferred until '
'the first function call.')
return dict([(k, v) for k, v in self._cached_codes.items()])
@property
def cached_code(self):
"""Returns `next(iter(self.cached_codes.values()))`.
This property method is for debugging purposes.
The return value is not guaranteed to keep backward compatibility.
"""
codes = self._cached_codes
if len(codes) > 1:
warnings.warn(
'The input types of the kernel could not be inferred. '
'Please use `.cached_codes` instead.')
return next(iter(codes.values()))
cdef str fix_cast_expr(src_type, dst_type, str expr):
src_kind = get_dtype(src_type).kind
dst_kind = get_dtype(dst_type).kind
if src_kind == dst_kind:
return expr
if src_kind == 'b':
# HIP has an issue with bool conversions detailed below
if runtime._is_hip_environment:
return f'_hip_bool_cast({expr})'
else:
return f'({expr}) ? 1 : 0'
if src_kind == 'c':
if dst_kind == 'b':
return f'({expr}) != {_scalar.get_typename(src_type)}()'
else: # dst_kind in 'iuf' (int, uint, float)
return f'({expr}).real()'
return expr
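# Illustrative expansions (not part of the original module), on a CUDA
# (non-HIP) build:
#     fix_cast_expr(numpy.bool_, numpy.int32, 'in0[i]')
#         -> '(in0[i]) ? 1 : 0'
#     fix_cast_expr(numpy.complex64, numpy.float32, 'in0[i]')
#         -> '(in0[i]).real()'
#     fix_cast_expr(numpy.float32, numpy.float64, 'in0[i]')
#         -> 'in0[i]'  (same kind, no rewrite)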
cdef function.Function _get_ufunc_kernel(
tuple in_types, tuple out_types, routine, tuple arginfos,
bint has_where, params,
name, preamble, loop_prep):
cdef _ArgInfo arginfo
cdef str str_type, str_var
offset_where = len(in_types)
offset_out = offset_where
if has_where:
offset_out += 1
types = []
op = []
if has_where:
arginfo = arginfos[offset_where]
if arginfo.is_ndarray():
op.append('if(!_raw__where[_ind.get()]) continue;')
else:
op.append('if(!_where) continue;')
for i, x in enumerate(in_types):
str_var = 'in%d' % i
str_type = str_var + '_type'
types.append((str_type, x))
arginfo = arginfos[i]
if arginfo.is_ndarray():
op.append('const {} {}({});'.format(
str_type,
str_var,
fix_cast_expr(arginfo.dtype, x, f'_raw_{str_var}[_ind.get()]')
))
out_op = []
for i, x in enumerate(out_types):
str_var = 'out%d' % i
str_type = str_var + '_type'
types.append((str_type, x))
arginfo = arginfos[i + offset_out]
op.append(f'{str_type} {str_var};')
out_op.append('{} = {};'.format(
f'_raw_{str_var}[_ind.get()]',
fix_cast_expr(x, arginfo.dtype, str_var)
))
type_map = _TypeMap(tuple(types))
op.append(routine)
op.append(';')
op.extend(out_op)
operation = '\n'.join(op)
# HIP/ROCm 4.3 has an issue with ifs and ternary operators
#
# int bool(int x) {
# if (x != 0) return 1;
# return 0;
# }
#
# bool(5) == 1; //false
# bool(5) == 5; //true
#
# also it simplifies (a ? 1 : 0) directly to a, and yields
# an incorrect value
if runtime._is_hip_environment:
preamble += """
__device__ int _hip_bool_cast(long long int x) {
volatile int a = 1;
if (x == 0) a = 0;
return a;
}
"""
return _get_simple_elementwise_kernel(
params, arginfos, operation, name, type_map, preamble,
loop_prep=loop_prep)
cdef inline bint _check_should_use_min_scalar(list in_args) except? -1:
cdef int kind, max_array_kind, max_scalar_kind
cdef bint all_scalars
all_scalars = True
max_array_kind = -1
max_scalar_kind = -1
for i in in_args:
kind = _get_kind_score(ord(i.dtype.kind))
if isinstance(i, _ndarray_base):
all_scalars = False
max_array_kind = max(max_array_kind, kind)
else:
max_scalar_kind = max(max_scalar_kind, kind)
return (max_scalar_kind != -1 and
not all_scalars and
max_array_kind >= max_scalar_kind)
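# Illustrative behavior (not part of the original module): mixing a
# float32 ndarray with a Python float scalar gives
# max_array_kind == max_scalar_kind == 2, so this returns True and the
# scalar's minimal dtype is used, keeping the output float32. With only
# scalars (all_scalars stays True), or with an integer array and a
# float scalar (1 < 2), it returns False.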
cdef dict _mst_unsigned_to_signed = {
i: (numpy.iinfo(j).max, (i, j))
for i, j in [(numpy.dtype(i).type, numpy.dtype(i.lower()).type)
for i in "BHILQ"]}
cdef _numpy_min_scalar_type = numpy.min_scalar_type
cdef _min_scalar_type(x):
# A non-negative integer may have two locally minimum scalar
# types: signed/unsigned integer.
# Return both for can_cast, while numpy.min_scalar_type only returns
# the unsigned type.
t = _numpy_min_scalar_type(x)
dt = t.type
if t.kind == 'u':
m, dt2 = <tuple>_mst_unsigned_to_signed[dt]
if x <= m:
return dt2
return dt
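# Illustrative results (not part of the original module):
#     _min_scalar_type(5)    -> (numpy.uint8, numpy.int8)  # fits both
#     _min_scalar_type(200)  -> numpy.uint8                # > int8 max
#     _min_scalar_type(-5)   -> numpy.int8                 # signed kind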
cdef class ufunc:
"""Universal function.
Attributes:
~ufunc.name (str): The name of the universal function.
~ufunc.nin (int): Number of input arguments.
~ufunc.nout (int): Number of output arguments.
~ufunc.nargs (int): Number of all arguments.
"""
cdef:
readonly Py_ssize_t nin
readonly Py_ssize_t nout
readonly Py_ssize_t nargs
readonly object name
readonly _Ops _ops # normal routines
# routines based on explicitly given output dtype
readonly _Ops _out_ops
readonly object _preamble
readonly object _loop_prep
readonly object _default_casting
readonly object _cutensor_op
readonly int _cutensor_alpha
readonly int _cutensor_gamma
readonly str _scatter_op
readonly tuple _params
readonly tuple _params_with_where
readonly dict _routine_cache
readonly dict _kernel_memo
readonly object __doc__
readonly object __name__
readonly object __module__
def __init__(
self, name, nin, nout, _Ops ops, preamble='', loop_prep='', doc='',
default_casting=None, *, _Ops out_ops=None, cutensor_op=None,
scatter_op=None):
self.name = name
self.__name__ = name
self.nin = nin
self.nout = nout
self.nargs = nin + nout
self._ops = ops
self._out_ops = out_ops
self._preamble = preamble
self._loop_prep = loop_prep
self.__doc__ = doc
if default_casting is None:
self._default_casting = 'same_kind'
else:
self._default_casting = default_casting
if cutensor_op is not None and cuda_cutensor is not None:
self._cutensor_op, self._cutensor_alpha, self._cutensor_gamma = (
getattr(cuda_cutensor, cutensor_op[0]),
cutensor_op[1], cutensor_op[2])
self._scatter_op = scatter_op
_in_params = tuple(
ParameterInfo('T in%d' % i, True)
for i in range(nin))
_out_params = tuple(
ParameterInfo('T out%d' % i, False)
for i in range(nout))
_other_params = (
ParameterInfo('CIndexer _ind', False),)
self._params = _in_params + _out_params + _other_params
self._params_with_where = (
_in_params + (ParameterInfo('T _where', False),)
+ _out_params + _other_params)
self._routine_cache = {}
self._kernel_memo = {}
def __repr__(self):
return '<ufunc \'%s\'>' % self.name
@property
def types(self):
"""A list of type signatures.
Each type signature is represented by type character codes of inputs
and outputs separated by '->'.
"""
types = []
for op in self._ops.ops:
in_str = ''.join([<str>get_dtype(t).char for t in op.in_types])
out_str = ''.join([<str>get_dtype(t).char for t in op.out_types])
types.append('%s->%s' % (in_str, out_str))
return types
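# Usage sketch (illustrative, not part of the original source): for a
# binary ufunc such as cupy.add this property yields signatures like
#
#     cupy.add.types  # e.g. [..., 'bb->b', ..., 'ff->f', 'dd->d', ...]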
def __call__(self, *args, **kwargs):
"""Applies the universal function to arguments elementwise.
Args:
args: Input arguments. Each of them can be a :class:`cupy.ndarray`
object or a scalar. The output arguments can be omitted or be
specified by the ``out`` argument.
out (cupy.ndarray): Output array. It outputs to new arrays
by default.
dtype: Data type specifier.
Returns:
Output array or a tuple of output arrays.
"""
for arg in args:
if hasattr(arg, '__cupy_override_elementwise_kernel__'):
return arg.__cupy_override_elementwise_kernel__(
self, *args, **kwargs)
if _fusion_thread_local.is_fusing():
return _fusion_thread_local.call_ufunc(self, *args, **kwargs)
cdef function.Function kern
cdef list broad_values
cdef shape_t shape
out = kwargs.pop('out', None)
where = kwargs.pop('_where', None)
cdef bint has_where = where is not None
dtype = kwargs.pop('dtype', None)
# Note default behavior of casting is 'same_kind' on numpy>=1.10
casting = kwargs.pop('casting', self._default_casting)
if dtype is not None:
dtype = get_dtype(dtype).type
if kwargs:
raise TypeError('Wrong arguments %s' % kwargs)
n_args = len(args)
if not (self.nin <= n_args <= self.nargs):
# TODO(kataoka): Fix error message for nout >= 2 (e.g. divmod)
raise TypeError(
'Wrong number of arguments for {!r}. '
'It must be either {} or {} (with outputs), '
'but given {}.'.format(
self.name, self.nin, self.nargs, n_args))
# parse inputs (positional) and outputs (positional or keyword)
in_args = args[:self.nin]
out_args = args[self.nin:]
if out is not None:
if out_args:
raise ValueError('Cannot specify \'out\' as both '
'a positional and keyword argument')
if isinstance(out, tuple):
if len(out) != self.nout:
raise ValueError(
"The 'out' tuple must have exactly one entry per "
"ufunc output")
out_args = out
else:
if 1 != self.nout:
raise ValueError("'out' must be a tuple of arrays")
out_args = out,
dev_id = device.get_device_id()
in_args = _preprocess_args(dev_id, in_args, False)
out_args = _preprocess_optional_args(dev_id, out_args, False)
given_out_args = [o for o in out_args if o is not None]
# TODO(kataoka): Typecheck `in_args` w.r.t. `casting` (before
# broadcast).
if has_where:
where_args = _preprocess_args(dev_id, (where,), False)
x = where_args[0]
if isinstance(x, _ndarray_base):
# NumPy seems to use casting='safe' here
if x.dtype != bool:
raise TypeError(
f'Cannot cast array data from {x.dtype!r} to '
f'{get_dtype(bool)!r} according to the rule \'safe\'')
else:
# NumPy does not seem to raise TypeError.
# CuPy does not have to support `where=object()` etc. and
# `_preprocess_args` rejects it anyway.
where_args[0] = _scalar.CScalar.from_numpy_scalar_with_dtype(
x, numpy.bool_)
else:
where_args = []
# _copy_in_args_if_needed updates in_args
_copy_in_args_if_needed(in_args, given_out_args)
_copy_in_args_if_needed(where_args, given_out_args)
broad_values = in_args + where_args + given_out_args
# _broadcast updates shape
internal._broadcast_core(broad_values, shape)
if (self._cutensor_op is not None
and _accelerator.ACCELERATOR_CUTENSOR in
_accelerator._elementwise_accelerators):
if (self.nin == 2 and self.nout == 1 and
isinstance(in_args[0], _ndarray_base) and
isinstance(in_args[1], _ndarray_base)):
import cupyx.cutensor
ret = cupyx.cutensor._try_elementwise_binary_routine(
in_args[0], in_args[1], dtype,
out_args[0] if len(out_args) == 1 else None,
self._cutensor_op,
self._cutensor_alpha,
self._cutensor_gamma,
)
if ret is not None:
return ret
op = self._ops.guess_routine(
self.name, self._routine_cache, in_args, dtype, self._out_ops)
# Determine a template object from which we initialize the output when
# inputs have subclass instances
def issubclass1(cls, classinfo):
return issubclass(cls, classinfo) and cls is not classinfo
subtype = cupy.ndarray
template = None
for in_arg in in_args:
in_arg_type = type(in_arg)
if issubclass1(in_arg_type, cupy.ndarray):
subtype = in_arg_type
template = in_arg
break
out_args = _get_out_args_from_optionals(
subtype, out_args, op.out_types, shape, casting, template)
if self.nout == 1:
ret = out_args[0]
else:
ret = tuple(out_args)
if _contains_zero(shape):
return ret
inout_args = []
for i, t in enumerate(op.in_types):
x = broad_values[i]
inout_args.append(
x if isinstance(x, _ndarray_base) else
_scalar.CScalar.from_numpy_scalar_with_dtype(x, t))
if has_where:
x = broad_values[self.nin]
inout_args.append(x)
inout_args.extend(out_args)
shape = _reduce_dims(inout_args, self._params, shape)
indexer = _carray._indexer_init(shape)
inout_args.append(indexer)
arginfos = _get_arginfos(inout_args)
kern = self._get_ufunc_kernel(dev_id, op, arginfos, has_where)
kern.linear_launch(indexer.size, inout_args)
return ret
cdef str _get_name_with_type(self, tuple arginfos, bint has_where):
cdef str name = self.name
if has_where:
name += '_where'
cdef _ArgInfo arginfo
inout_type_words = []
for arginfo in arginfos:
dtype = str(numpy.dtype(arginfo.dtype))
if arginfo.is_ndarray():
inout_type_words.append(dtype)
elif arginfo.is_scalar():
inout_type_words.append(dtype.rstrip('0123456789'))
return '{}__{}'.format(name, '_'.join(inout_type_words))
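# For example (illustrative, not in the original source): calling cupy.add
# on two float32 arrays with a float32 output would produce a kernel name
# along the lines of 'cupy_add__float32_float32_float32'; scalar inputs
# drop the trailing size digits (e.g. 'float' instead of 'float32') via
# the rstrip above.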
cdef function.Function _get_ufunc_kernel(
self, int dev_id, _Op op, tuple arginfos, bint has_where):
cdef function.Function kern
key = (dev_id, op, arginfos, has_where)
kern = self._kernel_memo.get(key, None)
if kern is None:
name = self._get_name_with_type(arginfos, has_where)
params = self._params_with_where if has_where else self._params
kern = _get_ufunc_kernel(
op.in_types, op.out_types, op.routine, arginfos, has_where,
params, name, self._preamble, self._loop_prep)
self._kernel_memo[key] = kern
return kern
def outer(self, A, B, **kwargs):
"""Apply the ufunc operation to all pairs of elements in A and B.
.. seealso::
:meth:`numpy.ufunc.outer`
"""
A = core.array(A)
B = core.array(B)
ndim_a = A.ndim
ndim_b = B.ndim
A = A.reshape(A.shape + (1,) * ndim_b)
B = B.reshape((1,) * ndim_a + B.shape)
return self(A, B, **kwargs)
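# Usage sketch (hedged, not part of the original source): outer() reshapes
# the operands so that broadcasting forms all pairs, hence the result shape
# is A.shape + B.shape:
#
#     a = cupy.arange(3)               # shape (3,)
#     b = cupy.arange(4)               # shape (4,)
#     cupy.multiply.outer(a, b).shape  # -> (3, 4)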
def at(self, a, indices, b=None):
"""Apply in place operation on the operand ``a`` for elements
specified by ``indices``.
.. seealso::
:meth:`numpy.ufunc.at`
"""
if self._scatter_op is not None:
a._scatter_op(indices, b, self._scatter_op)
else:
raise NotImplementedError(f'`{self.name}.at` is not supported yet')
def reduce(self, array, axis=0, dtype=None, out=None, keepdims=False):
"""Reduce ``array`` applying ufunc.
.. seealso::
:meth:`numpy.ufunc.reduce`
"""
if self.name == 'cupy_add':
return array.sum(axis, dtype, out, keepdims)
if self.name == 'cupy_multiply':
return array.prod(axis, dtype, out, keepdims)
raise NotImplementedError(f'`{self.name}.reduce` is not supported yet')
def accumulate(self, array, axis=0, dtype=None, out=None):
"""Accumulate ``array`` applying ufunc.
.. seealso::
:meth:`numpy.ufunc.accumulate`
"""
if self.name == 'cupy_add':
return array.cumsum(axis, dtype, out)
if self.name == 'cupy_multiply':
return array.cumprod(axis, dtype, out)
raise NotImplementedError(
f'`{self.name}.accumulate` is not supported yet')
def reduceat(self, array, indices, axis=0, dtype=None, out=None):
"""Reduce ``array`` applying ufunc with indices.
.. seealso::
:meth:`numpy.ufunc.reduceat`
"""
if self.name == 'cupy_add':
return array._add_reduceat(indices, axis, dtype, out)
raise NotImplementedError(
f'`{self.name}.reduceat` is not supported yet')
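# Usage sketch (illustrative): only the additive/multiplicative ufuncs have
# fast paths in the methods above, delegating to the ndarray reductions:
#
#     x = cupy.arange(6).reshape(2, 3)
#     cupy.add.reduce(x, axis=0)    # same as x.sum(axis=0)
#     cupy.multiply.accumulate(x)   # same as x.cumprod(axis=0)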
cdef class _Op:
def __init__(
self, tuple in_types, tuple out_types, object routine,
object error_func):
if error_func is None:
assert routine is not None
else:
assert callable(error_func)
self.in_types = in_types
self.out_types = out_types
self.nin = len(in_types)
self.nout = len(out_types)
self.routine = routine
self.error_func = error_func
@staticmethod
cdef _Op _from_type_and_routine_or_error_func(
str typ, object routine, object error_func):
# TODO(niboshi): Write type mapping specification.
types = typ.split('->')
if len(types) == 1:
in_types = out_types = tuple(types)
else:
in_types, out_types = map(tuple, types)
in_types = tuple([get_dtype(t).type for t in in_types])
out_types = tuple([get_dtype(t).type for t in out_types])
return _Op(in_types, out_types, routine, error_func)
@staticmethod
cdef _Op from_type_and_routine(str typ, routine):
return _Op._from_type_and_routine_or_error_func(typ, routine, None)
@staticmethod
cdef _Op from_type_and_error_func(str typ, error_func):
return _Op._from_type_and_routine_or_error_func(typ, None, error_func)
cdef check_valid(self):
if self.error_func is not None:
self.error_func()
cpdef tuple get_in_dtypes(self):
return tuple([get_dtype(t) for t in self.in_types])
cpdef tuple get_out_dtypes(self):
return tuple([get_dtype(t) for t in self.out_types])
cdef class _Ops:
def __init__(self, tuple ops):
assert len(ops) > 0
nin = ops[0].nin
nout = ops[0].nout
assert all(op.nin == nin for op in ops)
assert all(op.nout == nout for op in ops)
self.ops = ops
self.nin = nin
self.nout = nout
@staticmethod
cdef _Ops from_tuples(object ops, routine):
ops_ = []
for t in ops:
if isinstance(t, tuple):
typ, rt = t
if isinstance(rt, tuple):
rt = tuple([r1 or r2 for r1, r2 in zip(rt, routine)])
elif not isinstance(rt, str):
assert callable(rt)
ops_.append(_Op.from_type_and_error_func(typ, rt))
continue
else:
assert isinstance(t, str)
typ, rt = t, routine
ops_.append(_Op.from_type_and_routine(typ, rt))
return _Ops(tuple(ops_))
cpdef _Op guess_routine(
self, str name, dict cache, list in_args, dtype, _Ops out_ops):
cdef _Ops ops_
if dtype is None:
use_raw_value = _check_should_use_min_scalar(in_args)
if use_raw_value:
in_types = tuple([
a.dtype.type if isinstance(a, _ndarray_base)
else _min_scalar_type(a)
for a in in_args])
else:
in_types = tuple([a.dtype.type for a in in_args])
op = cache.get(in_types, ())
if op is ():
op = self._guess_routine_from_in_types(in_types)
cache[in_types] = op
else:
op = cache.get(dtype, ())
if op is ():
ops_ = out_ops or self
op = ops_._guess_routine_from_dtype(dtype)
cache[dtype] = op
if op is not None:
# raise TypeError if the type combination is disallowed
(<_Op>op).check_valid()
return op
if dtype is None:
dtype = tuple([a.dtype.type for a in in_args])
raise TypeError('Wrong type (%s) of arguments for %s' %
(dtype, name))
cpdef _Op _guess_routine_from_in_types(
self, tuple in_types, object can_cast=_numpy_can_cast):
cdef _Op op
cdef tuple op_types
cdef Py_ssize_t n = len(in_types)
cdef Py_ssize_t i
for op in self.ops:
op_types = op.in_types
for i in range(n):
it = in_types[i]
ot = op_types[i]
if isinstance(it, tuple):
if not can_cast(it[0], ot) and not can_cast(it[1], ot):
break
elif not can_cast(it, ot):
break
else:
return op
return None
cpdef _Op _guess_routine_from_dtype(self, object dtype):
cdef _Op op
cdef tuple op_types
for op in self.ops:
op_types = op.out_types
for t in op_types:
if t != dtype:
break
else:
return op
return None
cpdef create_ufunc(name, ops, routine=None, preamble='', doc='',
default_casting=None, loop_prep='', out_ops=None,
cutensor_op=None, scatter_op=None):
ops_ = _Ops.from_tuples(ops, routine)
_out_ops = None if out_ops is None else _Ops.from_tuples(out_ops, routine)
return ufunc(
name, ops_.nin, ops_.nout, ops_, preamble,
loop_prep, doc, default_casting=default_casting, out_ops=_out_ops,
cutensor_op=cutensor_op, scatter_op=scatter_op)
from cupy._core.core cimport _ndarray_base
from libcpp.pair cimport pair
cpdef pair[Py_ssize_t, Py_ssize_t] get_bound(_ndarray_base array)
cpdef bint may_share_bounds(_ndarray_base a, _ndarray_base b)
from cupy._core.core cimport _ndarray_base
from cupy.cuda cimport memory
from libcpp.pair cimport pair
cpdef pair[Py_ssize_t, Py_ssize_t] get_bound(_ndarray_base array):
cdef Py_ssize_t left = array.data.ptr
cdef Py_ssize_t right = left
cdef Py_ssize_t tmp
cdef pair[Py_ssize_t, Py_ssize_t] ret
cdef size_t i
for i in range(array._shape.size()):
# shape[i] != 0 is assumed
tmp = (array._shape[i] - 1) * array._strides[i]
if tmp > 0:
right += tmp
else:
left += tmp
ret.first = left
ret.second = right + <Py_ssize_t>array.dtype.itemsize
return ret
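# Worked example (hedged, not in the original source): for a C-contiguous
# float32 array of shape (2, 3), the strides are (12, 4), so
#
#     right = ptr + (2-1)*12 + (3-1)*4 = ptr + 20
#
# and the returned half-open byte range is [ptr, ptr + 20 + 4) =
# [ptr, ptr + 24), i.e. exactly the 6 * 4 bytes the array touches.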
cpdef bint may_share_bounds(_ndarray_base a, _ndarray_base b):
cdef memory.MemoryPointer a_data = a.data
cdef memory.MemoryPointer b_data = b.data
cdef pair[Py_ssize_t, Py_ssize_t] a_range, b_range
if (a_data.device_id != b_data.device_id
or a_data.mem.ptr != b_data.mem.ptr
or a.size == 0 or b.size == 0):
return False
a_range = get_bound(a)
b_range = get_bound(b)
return a_range.first < b_range.second and b_range.first < a_range.second
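# Usage sketch (illustrative): overlapping views of the same buffer are
# detected by intersecting their byte ranges:
#
#     a = cupy.arange(4, dtype=cupy.float32)
#     may_share_bounds(a[:2], a[1:3])  # True: byte ranges intersect
#     may_share_bounds(a[:2], a[2:])   # False: ranges only touch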
cdef object _thread_local
cdef dict _contexts
cdef class _OptimizationConfig:
cdef readonly object optimize_impl
cdef readonly int max_trials
cdef readonly float timeout
cdef readonly float expected_total_time_per_trial
cdef readonly float max_total_time_per_trial
cdef class _OptimizationContext:
cdef readonly str key
cdef readonly _OptimizationConfig config
cdef readonly dict _params_map
cdef readonly bint _dirty
cpdef _OptimizationContext get_current_context()
import pickle
import threading
cdef _thread_local = threading.local()
cdef _contexts = {}
cdef class _OptimizationConfig:
def __init__(
self, optimize_impl, *,
int max_trials=100,
float timeout=1,
float expected_total_time_per_trial=100 * 1e-6,
float max_total_time_per_trial=0.1):
self.optimize_impl = optimize_impl
self.max_trials = max_trials
self.timeout = timeout
self.expected_total_time_per_trial = expected_total_time_per_trial
self.max_total_time_per_trial = max_total_time_per_trial
cdef class _OptimizationContext:
def __init__(self, str key, _OptimizationConfig config):
self.key = key
self.config = config
self._params_map = {}
self._dirty = False
def get_params(self, key):
return self._params_map.get(key)
def set_params(self, key, params):
self._params_map[key] = params
self._dirty = True
def save(self, filepath):
with open(filepath, mode='wb') as f:
pickle.dump((self.key, self._params_map), f)
self._dirty = False
def load(self, filepath):
with open(filepath, mode='rb') as f:
key, params_map = pickle.load(f)
if key != self.key:
raise ValueError(
'Optimization key mismatch {} != {}'.format(key, self.key))
self._params_map = params_map
self._dirty = False
def _is_dirty(self):
return self._dirty
cpdef _OptimizationContext get_current_context():
try:
return _thread_local.current_context
except AttributeError:
return None
def set_current_context(_OptimizationContext context):
_thread_local.current_context = context
def get_new_context(
str key, object optimize_impl, dict config_dict):
c = _contexts.get(key)
if c is None:
config = _OptimizationConfig(optimize_impl, **config_dict)
c = _OptimizationContext(key, config)
_contexts[key] = c
return c
def _clear_all_contexts_cache():
global _contexts
assert get_current_context() is None
_contexts = {}
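# Usage sketch (hedged; `my_impl` is a hypothetical optimizer callback):
# contexts are keyed and cached process-wide, and tuned parameter maps can
# be persisted between runs:
#
#     ctx = get_new_context('my_key', my_impl, {'max_trials': 10})
#     set_current_context(ctx)
#     ...                        # kernels record tuned params into ctx
#     ctx.save('/tmp/params.pickle')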
from cupy._core._carray cimport shape_t
from cupy._core cimport _kernel
from cupy._core.core cimport _ndarray_base
from cupy.cuda cimport function
cdef Py_ssize_t _block_size
cpdef tuple _get_axis(object axis, Py_ssize_t ndim)
cpdef shape_t _get_out_shape(
const shape_t& shape, tuple reduce_axis, tuple out_axis, bint keepdims)
cdef class _AbstractReductionKernel:
cdef:
readonly str name
public str identity
readonly tuple in_params
readonly tuple out_params
readonly tuple _params
readonly str __name__
readonly dict _cached_codes
cpdef _ndarray_base _call(
self,
list in_args, list out_args,
const shape_t& a_shape, axis, dtype,
bint keepdims, bint reduce_dims, int device_id,
stream, bint try_use_cub=*, bint sort_reduce_axis=*)
cdef void _launch(
self, out_block_num, block_size, block_stride,
in_args, out_args, in_shape, out_shape, types,
map_expr, reduce_expr, post_map_expr, reduce_type,
stream, params)
cdef tuple _get_expressions_and_types(
self, list in_args, list out_args, dtype)
cdef list _get_out_args(
self, list out_args, tuple out_types, const shape_t& out_shape)
cdef function.Function _get_function(
self,
tuple params, tuple arginfos, _kernel._TypeMap types,
str map_expr, str reduce_expr, str post_map_expr, str reduce_type,
Py_ssize_t block_size)
cdef class ReductionKernel(_AbstractReductionKernel):
cdef:
readonly int nin
readonly int nout
readonly int nargs
readonly tuple params
readonly str reduce_expr
readonly str map_expr
readonly str post_map_expr
readonly object options
readonly bint reduce_dims
readonly object reduce_type
readonly str preamble
cdef shape_t _set_permuted_args(
list args, tuple axis_permutes, const shape_t& shape, tuple params)
cdef tuple _get_shape_and_strides(list in_args, list out_args)
cdef _optimizer_copy_arg(a)
cpdef create_reduction_func(
name, ops, routine=*, identity=*, preamble=*, sort_reduce_axis=*)
from cpython cimport sequence
from cupy._core cimport _carray
from cupy._core cimport _accelerator
from cupy._core._carray cimport shape_t
from cupy._core cimport _cub_reduction
from cupy._core._dtype cimport get_dtype
from cupy._core cimport _kernel
from cupy._core._kernel cimport _broadcast
from cupy._core._kernel cimport _check_peer_access
from cupy._core._kernel cimport _get_arginfos
from cupy._core._kernel cimport _get_out_args_from_optionals
from cupy._core._kernel cimport _get_out_args_with_params
from cupy._core._kernel cimport _preprocess_args
from cupy._core._kernel cimport _reduce_dims
from cupy._core._kernel cimport ParameterInfo, _ArgInfo
from cupy._core cimport _optimize_config
from cupy._core cimport _routines_manipulation as _manipulation
from cupy._core cimport _scalar
from cupy._core._scalar import get_typename as _get_typename
from cupy._core.core cimport _convert_object_with_cuda_array_interface
from cupy._core.core cimport _create_ndarray_from_shape_strides
from cupy._core.core cimport compile_with_cache
from cupy._core.core cimport _ndarray_base
from cupy._core cimport internal
from cupy.cuda cimport device
from cupy.cuda cimport function
from cupy_backends.cuda.api cimport runtime
import math
import string
import warnings
import numpy
import cupy
from cupy._core._kernel import _get_param_info
from cupy._core._kernel import _decide_params_type
from cupy._core._ufuncs import elementwise_copy
from cupy.cuda import compiler
from cupy import _util
cpdef str _create_reduction_function_code(
name, block_size, reduce_type, params, arginfos, identity,
pre_map_expr, reduce_expr, post_map_expr,
_kernel._TypeMap type_map, input_expr, output_expr, preamble, options):
# An (incomplete) list of internal variables:
# _J : the index of an element in the array
# _block_size : the number of threads in a block; should be power of 2
# _block_stride : the number of elements being processed by a block; should
# be power of 2 and <= _block_size
module_code = string.Template('''
${type_preamble}
${preamble}
#define REDUCE(a, b) (${reduce_expr})
#define POST_MAP(a) (${post_map_expr})
#define _REDUCE(_offset) if (_tid < _offset) { \
_type_reduce _a = _sdata[_tid], _b = _sdata[(_tid + _offset)]; \
_sdata[_tid] = REDUCE(_a, _b); \
}
typedef ${reduce_type} _type_reduce;
extern "C" __global__ void ${name}(${params}) {
__shared__ char _sdata_raw[${block_size} * sizeof(_type_reduce)];
_type_reduce *_sdata = reinterpret_cast<_type_reduce*>(_sdata_raw);
unsigned int _tid = threadIdx.x;
int _J_offset = _tid >> __popc(_block_stride - 1); // _tid / _block_stride
ptrdiff_t _j_offset = (ptrdiff_t)_J_offset * _out_ind.size();
int _J_stride = ${block_size} >> __popc(_block_stride - 1);
ptrdiff_t _j_stride = (ptrdiff_t)_J_stride * _out_ind.size();
for (ptrdiff_t _i_base = (ptrdiff_t)blockIdx.x * _block_stride;
_i_base < _out_ind.size();
_i_base += (ptrdiff_t)gridDim.x * _block_stride) {
_type_reduce _s = _type_reduce(${identity});
ptrdiff_t _i =
_i_base + (_tid & (_block_stride - 1)); // _tid % _block_stride
int _J = _J_offset;
for (ptrdiff_t _j = _i + _j_offset; _j < _in_ind.size();
_j += _j_stride, _J += _J_stride) {
_in_ind.set(_j);
${input_expr}
_type_reduce _a = static_cast<_type_reduce>(${pre_map_expr});
_s = REDUCE(_s, _a);
}
_sdata[_tid] = _s;
__syncthreads();
for (unsigned int _block = ${block_size} / 2;
_block >= _block_stride; _block >>= 1) {
if (_tid < _block) {
_REDUCE(_block);
}
__syncthreads();
}
if (_tid < _block_stride) {
_s = _sdata[_tid];
}
if (_tid < _block_stride && _i < _out_ind.size()) {
_out_ind.set(static_cast<ptrdiff_t>(_i));
${output_expr}
POST_MAP(_s);
}
}
}''').substitute(
name=name,
block_size=block_size,
reduce_type=reduce_type,
params=_kernel._get_kernel_params(params, arginfos),
identity=identity,
reduce_expr=reduce_expr,
pre_map_expr=pre_map_expr,
post_map_expr=post_map_expr,
type_preamble=type_map.get_typedef_code(),
input_expr=input_expr,
output_expr=output_expr,
preamble=preamble)
return module_code
cpdef function.Function _create_reduction_function_from_code(
name, code, options):
module = compile_with_cache(code, options)
return module.get_function(name)
cpdef function.Function _create_reduction_function(
name, block_size, reduce_type, params, arginfos, identity,
pre_map_expr, reduce_expr, post_map_expr,
_kernel._TypeMap type_map, input_expr, output_expr, preamble, options):
code = _create_reduction_function_code(
name, block_size, reduce_type, params, arginfos, identity,
pre_map_expr, reduce_expr, post_map_expr, type_map, input_expr,
output_expr, preamble, options
)
return _create_reduction_function_from_code(name, code, options)
cpdef tuple _get_axis(object axis, Py_ssize_t ndim):
cdef Py_ssize_t dim
if axis is None:
return (tuple(range(ndim)), ())
elif sequence.PySequence_Check(axis):
axis = tuple(axis)
else:
axis = axis,
reduce_axis = tuple(sorted(
[internal._normalize_axis_index(dim, ndim) for dim in axis]))
out_axis = tuple([dim for dim in range(ndim) if dim not in reduce_axis])
if len(reduce_axis) + len(out_axis) != ndim:
raise ValueError("duplicate value in 'axis'")
return reduce_axis, out_axis
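# Examples (illustrative, not in the original source):
#
#     _get_axis(None, 3)  # -> ((0, 1, 2), ()): reduce everything
#     _get_axis(1, 3)     # -> ((1,), (0, 2))
#     _get_axis(-1, 3)    # -> ((2,), (0, 1)): negative axes normalized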
cpdef shape_t _get_out_shape(
const shape_t& shape, tuple reduce_axis, tuple out_axis,
bint keepdims):
cdef shape_t out_shape
if keepdims:
out_shape = shape
for i in reduce_axis:
out_shape[i] = 1
else:
out_shape.reserve(len(out_axis))
for i in out_axis:
out_shape.push_back(shape[i])
return out_shape
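# Example (illustrative): reducing shape (2, 3, 4) over axis 1 gives
#
#     _get_out_shape((2, 3, 4), (1,), (0, 2), False)  # -> (2, 4)
#     _get_out_shape((2, 3, 4), (1,), (0, 2), True)   # -> (2, 1, 4)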
cdef shape_t _set_permuted_args(
list args, tuple axis_permutes, const shape_t& shape, tuple params):
# This function updates `args`
cdef ParameterInfo p
cdef Py_ssize_t i, s
cdef bint need_permutation = False
cdef shape_t out_shape
for i, s in enumerate(axis_permutes):
if i != s:
need_permutation = True
break
if need_permutation:
for p in params:
if p.raw:
raise NotImplementedError('Illegal conditions')
for i, a in enumerate(args):
if isinstance(a, _ndarray_base):
args[i] = _manipulation._transpose(a, axis_permutes)
out_shape.reserve(len(axis_permutes))
for i in axis_permutes:
out_shape.push_back(shape[i])
return out_shape
else:
return shape
cdef Py_ssize_t _get_contiguous_size(
list args, tuple params, list out_shape, Py_ssize_t ndim) except -1:
'''
Get the contiguous size along the *output* axes (not the *reduce* axes!).
'''
cdef int i, j
cdef ParameterInfo p
cdef Py_ssize_t contiguous_size, tmp_contiguous_size, itemsize
out_ndim = len(out_shape)
contiguous_size = 1
for i, a in enumerate(args):
if not isinstance(a, _ndarray_base):
continue
p = params[i]
if p.raw:
continue
tmp_contiguous_size = 1
itemsize = a.dtype.itemsize
for j in range(out_ndim):
if a._strides[ndim-j-1] != tmp_contiguous_size * itemsize:
break
tmp_contiguous_size *= out_shape[out_ndim-j-1]
contiguous_size = max(contiguous_size, tmp_contiguous_size)
return contiguous_size
cdef Py_ssize_t _default_block_size = (
256 if runtime._is_hip_environment else 512)
cdef Py_ssize_t _min_block_size_log = 5
cdef Py_ssize_t _max_block_size_log = (
8 if runtime._is_hip_environment else 9)
cpdef (Py_ssize_t, Py_ssize_t, Py_ssize_t) _get_block_specs( # NOQA
Py_ssize_t in_size, Py_ssize_t out_size,
Py_ssize_t contiguous_size,
Py_ssize_t block_size) except*:
cdef Py_ssize_t reduce_block_size, block_stride, out_block_num
if block_size == -1:
block_size = _default_block_size
reduce_block_size = max(1, in_size // out_size)
contiguous_size = min(contiguous_size, 32)
block_stride = max(contiguous_size, block_size // reduce_block_size)
block_stride = internal.clp2(block_stride // 2 + 1) # floor
out_block_num = (out_size + block_stride - 1) // block_stride
return block_size, block_stride, out_block_num
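# Worked example (hedged sketch): with in_size=4096, out_size=16,
# contiguous_size=64 and the CUDA default block_size=512:
#
#     reduce_block_size = max(1, 4096 // 16)  = 256
#     contiguous_size   = min(64, 32)         = 32
#     block_stride      = max(32, 512 // 256) = 32  # already a power of 2
#     out_block_num     = (16 + 31) // 32     = 1
#
# so one block of 512 threads covers all 16 outputs, each reduced by a
# 512 / 32 = 16-way thread subgroup.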
cdef tuple _sort_axis(tuple axis, tuple strides):
# Sorts axis in the decreasing order of absolute values of strides.
return tuple(sorted(axis, key=lambda i: -abs(strides[i])))
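# Example (illustrative): with strides (4, 32, 8), axes are reordered as
#
#     _sort_axis((0, 1, 2), (4, 32, 8))  # -> (1, 2, 0)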
cdef tuple _get_shape_and_strides(list in_args, list out_args):
cdef list shape_and_strides = []
for x in in_args + out_args:
if isinstance(x, _ndarray_base):
shape_and_strides.append(x.shape)
shape_and_strides.append(x.strides)
else:
shape_and_strides.append(None)
shape_and_strides.append(None)
return tuple(shape_and_strides)
cdef _optimizer_copy_arg(a):
if isinstance(a, _ndarray_base):
x = _create_ndarray_from_shape_strides(
cupy.ndarray, a._shape, a._strides, a.dtype, None)
assert a.data.device_id == x.data.device_id
elementwise_copy(a, x)
return x
return a
cdef class _AbstractReductionKernel:
def __init__(
self, str name, str identity, str in_params, str out_params):
assert name is not None
assert identity is not None
assert in_params is not None
assert out_params is not None
in_params_ = _get_param_info(in_params, True)
out_params_ = _get_param_info(out_params, False)
params = (
in_params_
+ out_params_
+ _get_param_info('CIndexer _in_ind, CIndexer _out_ind', False)
+ _get_param_info('int32 _block_stride', True))
self.name = name
self.identity = identity
self.in_params = in_params_
self.out_params = out_params_
self._params = params
# This is for profiling mechanisms to auto infer a name
self.__name__ = name
self._cached_codes = {}
cpdef _ndarray_base _call(
self,
list in_args, list out_args,
const shape_t& a_shape, axis, dtype,
bint keepdims, bint reduce_dims, int device_id,
stream, bint try_use_cub=False, bint sort_reduce_axis=True):
cdef tuple reduce_axis, out_axis, axis_permutes
cdef tuple params, opt_params
cdef tuple shape_and_strides
cdef Py_ssize_t contiguous_size = -1
cdef Py_ssize_t block_size, block_stride, out_block_num = 0
cdef shape_t in_shape, out_shape
cdef _ndarray_base ret
cdef bint cub_success
if dtype is not None:
dtype = get_dtype(dtype).type
(
map_expr, reduce_expr, post_map_expr,
in_types, out_types, reduce_type,
type_map,
) = self._get_expressions_and_types(in_args, out_args, dtype)
reduce_axis, out_axis = _get_axis(axis, a_shape.size())
# When there is only one input array, sort the axes in such a way that
# contiguous (C or F) axes can be squashed in _reduce_dims() later.
# TODO(niboshi): Support (out_axis) > 1
if (len(in_args) == 1
and len(out_axis) <= 1
and not in_args[0]._c_contiguous):
strides = in_args[0].strides
if sort_reduce_axis:
reduce_axis = _sort_axis(reduce_axis, strides)
out_axis = _sort_axis(out_axis, strides)
out_shape = _get_out_shape(a_shape, reduce_axis, out_axis, keepdims)
out_args = self._get_out_args(out_args, out_types, out_shape)
ret = out_args[0]
if ret.size == 0:
return ret
if self.identity == '' and internal.is_in(a_shape, 0):
raise ValueError(('zero-size array to reduction operation'
' %s which has no identity') % self.name)
in_args = [x if isinstance(x, _ndarray_base) else
_scalar.CScalar.from_numpy_scalar_with_dtype(x, t)
for x, t in zip(in_args, in_types)]
optimize_context = _optimize_config.get_current_context()
key = ()
if optimize_context is not None:
# Calculate a key unique to the reduction setting.
shape_and_strides = _get_shape_and_strides(in_args, out_args)
key = (self.name, shape_and_strides,
in_types, out_types, reduce_type, device_id)
# Try to use CUB
for accelerator in _accelerator._reduction_accelerators:
if try_use_cub and accelerator == _accelerator.ACCELERATOR_CUB:
cub_success = _cub_reduction._try_to_call_cub_reduction(
self, in_args, out_args, a_shape, stream, optimize_context,
key, map_expr, reduce_expr, post_map_expr, reduce_type,
type_map, reduce_axis, out_axis, out_shape, ret)
if cub_success:
return ret
axis_permutes = reduce_axis + out_axis
in_shape = _set_permuted_args(
in_args, axis_permutes, a_shape, self.in_params)
if reduce_dims:
in_shape = _reduce_dims(in_args, self.in_params, in_shape)
out_shape = _reduce_dims(out_args, self.out_params, out_shape)
params = self._params
# Calculate the reduction block dimensions.
if optimize_context is None:
# Calculate manually
contiguous_size = _get_contiguous_size(
in_args, self.in_params, out_shape, in_shape.size())
block_size, block_stride, out_block_num = _get_block_specs(
internal.prod(in_shape),
internal.prod(out_shape),
contiguous_size, -1)
else:
# Optimize dynamically
key = ('simple_reduction',) + key
opt_params = optimize_context.get_params(key)
if opt_params is None:
opt_params = self._get_optimized_params(
optimize_context.config, in_args, out_args,
in_shape, out_shape, type_map, map_expr, reduce_expr,
post_map_expr, reduce_type, stream)
optimize_context.set_params(key, opt_params)
block_size, block_stride, out_block_num = opt_params
# Launch the kernel
self._launch(
out_block_num,
block_size,
block_stride,
in_args, out_args,
in_shape, out_shape,
type_map,
map_expr, reduce_expr, post_map_expr, reduce_type,
stream, params)
return ret
def _get_optimized_params(
self, optimize_config, in_args, out_args, in_shape, out_shape,
type_map, map_expr, reduce_expr, post_map_expr, reduce_type,
stream):
out_size = internal.prod(out_shape)
in_args = [_optimizer_copy_arg(a) for a in in_args]
out_args = [_optimizer_copy_arg(a) for a in out_args]
contiguous_size = _get_contiguous_size(
in_args, self.in_params, out_shape, len(in_shape))
block_size, block_stride, default_out_block_num = _get_block_specs(
internal.prod(in_shape),
internal.prod(out_shape),
contiguous_size, -1)
default_block_size_log = math.floor(math.log2(block_size))
default_block_stride_log = math.floor(math.log2(block_stride))
def target_func(block_size, block_stride, out_block_num):
self._launch(
out_block_num, block_size, block_stride, in_args, out_args,
in_shape, out_shape, type_map, map_expr, reduce_expr,
post_map_expr, reduce_type, stream, self._params)
def suggest_func(trial):
block_size_log = trial.suggest_int(
'block_size_log', _min_block_size_log, _max_block_size_log)
block_size = 2 ** block_size_log
block_stride_log = trial.suggest_int(
'block_stride_log', 0, block_size_log)
block_stride = 2 ** block_stride_log
max_out_block_num = (out_size + block_stride - 1) // block_stride
out_block_num = trial.suggest_int(
'out_block_num', 1, max_out_block_num)
trial.set_user_attr('block_size', block_size)
trial.set_user_attr('block_stride', block_stride)
return block_size, block_stride, out_block_num
optimize_impl = optimize_config.optimize_impl
best = optimize_impl(
optimize_config, target_func, suggest_func,
default_best={
'block_size_log': default_block_size_log,
'block_stride_log': default_block_stride_log,
'out_block_num': default_out_block_num,
}
)
return (
best.user_attrs['block_size'],
best.user_attrs['block_stride'],
best.params['out_block_num'])
cdef inline void _launch(
self, out_block_num, block_size, block_stride,
in_args, out_args, in_shape, out_shape, type_map,
map_expr, reduce_expr, post_map_expr, reduce_type,
stream, params):
cdef function.Function func
inout_args = (
in_args
+ out_args
+ [
_carray._indexer_init(in_shape),
_carray._indexer_init(out_shape),
# block_stride is passed as the last argument.
_scalar.CScalar.from_int32(block_stride),
])
# Retrieve the kernel function
func = self._get_function(
params,
_get_arginfos(inout_args),
type_map,
map_expr, reduce_expr, post_map_expr, reduce_type,
block_size)
# Launch the kernel
func.linear_launch(
out_block_num * block_size, inout_args, 0, block_size, stream)
cdef tuple _get_expressions_and_types(
self, list in_args, list out_args, dtype):
raise NotImplementedError()
cdef list _get_out_args(
self, list out_args, tuple out_types, const shape_t& out_shape):
raise NotImplementedError()
cdef function.Function _get_function(
self,
tuple params, tuple arginfos, _kernel._TypeMap type_map,
str map_expr, str reduce_expr, str post_map_expr, str reduce_type,
Py_ssize_t block_size):
raise NotImplementedError()
@property
def cached_codes(self):
"""Returns a dict that has input types as keys and codes values.
This proprety method is for debugging purpose.
The return value is not guaranteed to keep backward compatibility.
"""
if len(self._cached_codes) == 0:
warnings.warn(
'No codes are cached because compilation is deferred until '
'the first function call or CUB is enabled.')
return dict([(k, v) for k, v in self._cached_codes.items()])
@property
def cached_code(self):
"""Returns `next(iter(self.cached_codes.values()))`.
This property method is for debugging purposes.
The return value is not guaranteed to keep backward compatibility.
"""
codes = self._cached_codes
if len(codes) > 1:
warnings.warn(
'The input types of the kernel could not be inferred. '
'Please use `.cached_codes` instead.')
return next(iter(codes.values()))
# -----------------------------------------------------------------------------
# create_reduction_func
# -----------------------------------------------------------------------------
cpdef _SimpleReductionKernel create_reduction_func(
name, ops, routine=None, identity=None, preamble='',
sort_reduce_axis=True):
ops = _kernel._Ops.from_tuples(ops, routine)
return _SimpleReductionKernel(
name, ops, identity, preamble, sort_reduce_axis)
cdef class _SimpleReductionKernel(_AbstractReductionKernel):
cdef:
readonly _kernel._Ops _ops
readonly str preamble
readonly int nin
readonly int nout
readonly str _input_expr
readonly str _output_expr
readonly dict _routine_cache
readonly bint _sort_reduce_axis
def __init__(
self, name, _kernel._Ops ops, identity, preamble,
sort_reduce_axis=True):
super().__init__(
name,
'' if identity is None else str(identity),
'T in0',
'T out0',
)
self._ops = ops
self.preamble = preamble
self.nin = 1
self.nout = 1
self._input_expr = 'const type_in0_raw in0 = _raw_in0[_in_ind.get()];'
self._output_expr = 'type_out0_raw &out0 = _raw_out0[_out_ind.get()];'
self._routine_cache = {}
self._sort_reduce_axis = sort_reduce_axis
def __call__(self, object a, axis=None, dtype=None, _ndarray_base out=None,
bint keepdims=False):
cdef _ndarray_base arr
if isinstance(a, _ndarray_base):
arr = a
elif hasattr(a, '__cuda_array_interface__'):
arr = _convert_object_with_cuda_array_interface(a)
elif hasattr(a, '__cupy_get_ndarray__'):
arr = a.__cupy_get_ndarray__()
else:
raise TypeError(
'Argument \'a\' has incorrect type (expected %s, got %s)' %
(cupy.ndarray, type(a)))
in_args = [arr]
dev_id = device.get_device_id()
_check_peer_access(arr, dev_id)
if out is None:
out_args = []
else:
_check_peer_access(out, dev_id)
out_args = [out]
reduce_dims = True
return self._call(
in_args, out_args,
arr._shape, axis, dtype, keepdims, reduce_dims, dev_id,
None, True, self._sort_reduce_axis)
cdef tuple _get_expressions_and_types(
self, list in_args, list out_args, dtype):
cdef _kernel._Op op
op = self._ops.guess_routine(
self.name, self._routine_cache, in_args, dtype, self._ops)
map_expr, reduce_expr, post_map_expr, reduce_type = op.routine
if reduce_type is None:
reduce_type = _get_typename(op.out_types[0])
if out_args:
out_type = out_args[0].dtype.type
else:
out_type = op.out_types[0]
# We guessed a routine that requires a C2R casting for the input
if (in_args[0].dtype.kind == 'c'
and numpy.dtype(op.in_types[0]).kind == 'f'):
warnings.warn(
'Casting complex values to real discards the imaginary part',
numpy.ComplexWarning)
in_args[0] = in_args[0].real
type_map = _kernel._TypeMap((
('type_in0_raw', in_args[0].dtype.type),
('type_out0_raw', out_type),
))
return (
map_expr, reduce_expr, post_map_expr,
op.in_types, op.out_types, reduce_type,
type_map)
cdef list _get_out_args(
self, list out_args, tuple out_types, const shape_t& out_shape):
return _get_out_args_from_optionals(
cupy.ndarray, out_args, out_types, out_shape, 'unsafe', None)
cdef function.Function _get_function(
self,
tuple params, tuple arginfos, _kernel._TypeMap type_map,
str map_expr, str reduce_expr, str post_map_expr, str reduce_type,
Py_ssize_t block_size):
in_types = []
for x in arginfos:
if x.type is cupy.ndarray:
in_types.append(cupy.dtype(x.dtype).char)
in_types = tuple(in_types)
if in_types not in self._cached_codes:
code = _SimpleReductionKernel_get_cached_function_code(
map_expr, reduce_expr, post_map_expr, reduce_type,
params, arginfos, type_map,
self.name, block_size, self.identity,
self._input_expr, self._output_expr, self.preamble, ())
self._cached_codes[in_types] = code
return _SimpleReductionKernel_get_cached_function(
map_expr, reduce_expr, post_map_expr, reduce_type,
params, arginfos, type_map,
self.name, block_size, self.identity,
self._input_expr, self._output_expr, self.preamble, ())
@_util.memoize()
def _SimpleReductionKernel_get_cached_function_code(
map_expr, reduce_expr, post_map_expr, reduce_type,
params, arginfos, _kernel._TypeMap type_map,
name, block_size, identity, input_expr, output_expr, preamble,
options):
return _create_reduction_function_code(
name, block_size, reduce_type, params, arginfos, identity,
map_expr, reduce_expr, post_map_expr,
type_map, input_expr, output_expr, preamble, options)
@_util.memoize(for_each_device=True)
def _SimpleReductionKernel_get_cached_function(
map_expr, reduce_expr, post_map_expr, reduce_type,
params, arginfos, _kernel._TypeMap type_map,
name, block_size, identity, input_expr, output_expr, preamble,
options):
return _create_reduction_function(
name, block_size, reduce_type, params, arginfos, identity,
map_expr, reduce_expr, post_map_expr,
type_map, input_expr, output_expr, preamble, options)
# -----------------------------------------------------------------------------
# ReductionKernel
# -----------------------------------------------------------------------------
cdef class ReductionKernel(_AbstractReductionKernel):
"""User-defined reduction kernel.
This class can be used to define a reduction kernel with or without
broadcasting.
The kernel is compiled at an invocation of the
:meth:`~ReductionKernel.__call__` method, which is cached for each device.
The compiled binary is also cached into a file under the
``$HOME/.cupy/kernel_cache/`` directory with a hashed file name. The cached
binary is reused by other processes.
Args:
in_params (str): Input argument list.
out_params (str): Output argument list.
map_expr (str): Mapping expression for input values.
reduce_expr (str): Reduction expression.
post_map_expr (str): Mapping expression for reduced values.
identity (str): Identity value for starting the reduction.
name (str): Name of the kernel function. It should be set for
readability of the performance profiling.
reduce_type (str): Type of values to be used for reduction. This type
is used to store the special variables ``a`` and ``b`` in
``reduce_expr``.
reduce_dims (bool): If ``True``, input arrays are reshaped without copy
to smaller dimensions for efficiency.
preamble (str): Fragment of the CUDA-C/C++ code that is inserted at the
top of the cu file.
options (tuple of str): Additional compilation options.
"""
def __init__(self, str in_params, str out_params,
map_expr, reduce_expr, post_map_expr,
identity, name='reduce_kernel', reduce_type=None,
reduce_dims=True, preamble='', options=()):
if not compiler.is_valid_kernel_name(name):
raise ValueError(
'Invalid kernel name: "%s"' % name)
super().__init__(
name,
'' if identity is None else str(identity),
in_params,
out_params,
)
self.nin = len(self.in_params)
self.nout = len(self.out_params)
self.nargs = self.nin + self.nout
self.reduce_expr = reduce_expr
self.map_expr = map_expr
self.post_map_expr = post_map_expr
self.options = options
self.reduce_dims = reduce_dims
if reduce_type is None:
self.reduce_type = self.out_params[0].ctype
else:
self.reduce_type = reduce_type
self.preamble = preamble
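# Usage sketch (hedged; mirrors the public ReductionKernel API): a row-wise
# sum written as a custom reduction:
#
#     kernel = cupy.ReductionKernel(
#         'T x', 'T y', 'x', 'a + b', 'y = a', '0', 'my_sum')
#     x = cupy.arange(10, dtype=cupy.float32).reshape(2, 5)
#     kernel(x, axis=1)   # -> array([10., 35.], dtype=float32)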
def __call__(self, *args, **kwargs):
"""Compiles and invokes the reduction kernel.
The compilation runs only if the kernel is not cached. Note that the
kernels with different argument dtypes, ndims, or axis are not
compatible. It means that single ReductionKernel object may be compiled
into multiple kernel binaries.
Args:
args: Arguments of the kernel.
out (cupy.ndarray): The output array. This can only be specified if
``args`` does not contain the output array.
axis (int or tuple of ints): Axis or axes along which the
reduction is performed.
keepdims (bool): If ``True``, the specified axes are retained as
axes of length one.
stream (cupy.cuda.Stream, optional): The CUDA stream to launch the
kernel on. If not given, the current stream will be used.
Returns:
Arrays are returned according to the ``out_params`` argument of the
``__init__`` method.
"""
cdef shape_t broad_shape
out = kwargs.pop('out', None)
axis = kwargs.pop('axis', None)
keepdims = kwargs.pop('keepdims', False)
stream = kwargs.pop('stream', None)
if kwargs:
raise TypeError('Wrong arguments %s' % kwargs)
n_args = len(args)
if n_args != self.nin and n_args != self.nargs:
raise TypeError('Wrong number of arguments for %s' % self.name)
out_args = list(args[self.nin:])
if out is not None:
if self.nout != 1:
raise NotImplementedError('')
if len(out_args) != 0:
raise ValueError("cannot specify 'out' as both "
"a positional and keyword argument")
out_args = [out]
dev_id = device.get_device_id()
in_args = _preprocess_args(dev_id, args[:self.nin], False)
out_args = _preprocess_args(dev_id, out_args, False)
in_args = _broadcast(in_args, self.in_params, False, broad_shape)
return self._call(
in_args, out_args,
broad_shape, axis, None,
keepdims, self.reduce_dims, dev_id, stream, True, True)
cdef tuple _get_expressions_and_types(
self, list in_args, list out_args, dtype):
in_ndarray_types = tuple(
[a.dtype.type if isinstance(a, _ndarray_base) else None
for a in in_args])
out_ndarray_types = tuple(
[a.dtype.type if isinstance(a, _ndarray_base) else None
for a in out_args])
in_types, out_types, type_map = _decide_params_type(
self.in_params, self.out_params,
in_ndarray_types, out_ndarray_types)
return (
self.map_expr, self.reduce_expr, self.post_map_expr,
in_types, out_types, self.reduce_type,
type_map)
cdef list _get_out_args(
self, list out_args, tuple out_types, const shape_t& out_shape):
return _get_out_args_with_params(
out_args, out_types, out_shape, self.out_params, False)
cdef function.Function _get_function(
self,
tuple params, tuple arginfos, _kernel._TypeMap type_map,
str map_expr, str reduce_expr, str post_map_expr, str reduce_type,
Py_ssize_t block_size):
in_types = []
for x in arginfos:
if x.type is cupy.ndarray:
in_types.append(cupy.dtype(x.dtype).char)
in_types = tuple(in_types)
if in_types not in self._cached_codes:
code = _ReductionKernel_get_cached_function_code(
self.nin, self.nout, params, arginfos, type_map,
self.name, block_size, reduce_type, self.identity,
map_expr, reduce_expr, post_map_expr,
self.preamble, self.options)
self._cached_codes[in_types] = code
return _ReductionKernel_get_cached_function(
self.nin, self.nout, params, arginfos, type_map,
self.name, block_size, reduce_type, self.identity,
map_expr, reduce_expr, post_map_expr,
self.preamble, self.options)
@_util.memoize()
def _ReductionKernel_get_cached_function_code(
nin, nout, params, arginfos, _kernel._TypeMap type_map,
name, block_size, reduce_type, identity, map_expr, reduce_expr,
post_map_expr, preamble, options):
cdef ParameterInfo p
cdef _ArgInfo arginfo
in_arrays = [
p for p, arginfo in zip(params[:nin], arginfos[:nin])
if not p.raw and arginfo.is_ndarray()]
out_arrays = [
p for p, arginfo in zip(params[nin:nin+nout], arginfos[nin:nin+nout])
if not p.raw and arginfo.is_ndarray()]
input_expr = '\n'.join(
[(('const {0} {1}' if p.is_const else '{0}& {1}') +
' = _raw_{1}[_in_ind.get()];').format(p.ctype, p.name)
for p in in_arrays])
output_expr = '\n'.join(
['{0} &{1} = _raw_{1}[_out_ind.get()];'.format(p.ctype, p.name)
for p in out_arrays if not p.is_const])
return _create_reduction_function_code(
name, block_size, reduce_type, params, arginfos, identity,
map_expr, reduce_expr, post_map_expr,
type_map, input_expr, output_expr, preamble, options)
@_util.memoize(for_each_device=True)
def _ReductionKernel_get_cached_function(
nin, nout, params, arginfos, _kernel._TypeMap type_map,
name, block_size, reduce_type, identity, map_expr, reduce_expr,
post_map_expr, preamble, options):
code = _ReductionKernel_get_cached_function_code(
nin, nout, params, arginfos, type_map,
name, block_size, reduce_type, identity, map_expr, reduce_expr,
post_map_expr, preamble, options)
return _create_reduction_function_from_code(name, code, options)
cdef object _bitwise_and
cdef object _bitwise_or
cdef object _bitwise_xor
cdef object _invert
cdef object _left_shift
cdef object _right_shift
from cupy._core._kernel import create_ufunc
cdef _create_bit_op(name, op, no_bool, doc='', scatter_op=None):
types = () if no_bool else ('??->?',)
return create_ufunc(
'cupy_' + name,
types + ('bb->b', 'BB->B', 'hh->h', 'HH->H', 'ii->i', 'II->I', 'll->l',
'LL->L', 'qq->q', 'QQ->Q'),
'out0 = in0 %s in1' % op,
doc=doc, scatter_op=scatter_op)
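# Example (illustrative): the helper above expands to a plain elementwise
# ufunc, e.g. _create_bit_op('bitwise_and', '&', False, ...) generates
# 'cupy_bitwise_and' with routine 'out0 = in0 & in1' over the bool and
# integer signatures listed, while no_bool=True (the shifts) drops '??->?'.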
cdef _bitwise_and = _create_bit_op(
'bitwise_and', '&', False,
'''Computes the bitwise AND of two arrays elementwise.
Only integer and boolean arrays are handled.
.. seealso:: :data:`numpy.bitwise_and`
''',
scatter_op='and')
cdef _bitwise_or = _create_bit_op(
'bitwise_or', '|', False,
'''Computes the bitwise OR of two arrays elementwise.
Only integer and boolean arrays are handled.
.. seealso:: :data:`numpy.bitwise_or`
''',
scatter_op='or')
cdef _bitwise_xor = _create_bit_op(
'bitwise_xor', '^', False,
'''Computes the bitwise XOR of two arrays elementwise.
Only integer and boolean arrays are handled.
.. seealso:: :data:`numpy.bitwise_xor`
''',
scatter_op='xor')
cdef _invert = create_ufunc(
'cupy_invert',
(('?->?', 'out0 = !in0'), 'b->b', 'B->B', 'h->h', 'H->H', 'i->i', 'I->I',
'l->l', 'L->L', 'q->q', 'Q->Q'),
'out0 = ~in0',
doc='''Computes the bitwise NOT of an array elementwise.
Only integer and boolean arrays are handled.
.. note::
:func:`cupy.bitwise_not` is an alias for :func:`cupy.invert`.
.. seealso:: :data:`numpy.invert`
''')
cdef _left_shift = _create_bit_op(
'left_shift', '<<', True,
'''Shifts the bits of each integer element to the left.
Only integer arrays are handled.
.. seealso:: :data:`numpy.left_shift`
''')
cdef _right_shift = _create_bit_op(
'right_shift', '>>', True,
'''Shifts the bits of each integer element to the right.
Only integer arrays are handled.
.. seealso:: :data:`numpy.right_shift`
''')
# Variables to expose to Python
# (cythonized data cannot be exposed to Python, even with cpdef.)
bitwise_and = _bitwise_and
bitwise_or = _bitwise_or
bitwise_xor = _bitwise_xor
invert = _invert
left_shift = _left_shift
right_shift = _right_shift
from cupy._core.core cimport _ndarray_base
cpdef _ndarray_base _ndarray_argwhere(_ndarray_base self)
cdef _ndarray_base _ndarray_getitem(_ndarray_base self, slices)
cdef _ndarray_setitem(_ndarray_base self, slices, value)
cdef tuple _ndarray_nonzero(_ndarray_base self)
cdef _scatter_op(_ndarray_base a, slices, value, op)
cdef _ndarray_base _ndarray_take(_ndarray_base self, indices, axis, out)
cdef _ndarray_base _ndarray_put(_ndarray_base self, indices, values, mode)
cdef _ndarray_base _ndarray_choose(_ndarray_base self, choices, out, mode)
cdef _ndarray_base _ndarray_compress(_ndarray_base self, condition, axis, out)
cdef _ndarray_base _ndarray_diagonal(_ndarray_base self, offset, axis1, axis2)
cdef _ndarray_base _add_reduceat(
_ndarray_base array, indices, axis, dtype, out)