Commit deb763b7 authored by root's avatar root
Browse files

Clone code from GitHub.

parent 93bf084b
Pipeline #3386 canceled with stages
import numpy
from cupy._core._dtype import get_dtype
import cupy
from cupy._core import _fusion_thread_local
from cupy._core import core
from cupy._core._scalar import get_typename
# Thread-local state shared with the fusion tracing machinery.
_thread_local = _fusion_thread_local.thread_local
# Lazily-built cache mapping an output dtype to its cast (astype) ufunc;
# ``None`` until first populated by ``_set_dtype_to_astype_dict``.
_dtype_to_astype_dict = None
def _set_dtype_to_astype_dict():
    """Populate ``_dtype_to_astype_dict`` with per-dtype cast ufuncs.

    For every supported dtype, creates a ufunc performing a
    ``static_cast`` to that dtype and stores it keyed by the output
    dtype.  Called lazily, at most once (see ``_VariableProxy.astype``).
    """
    global _dtype_to_astype_dict
    _dtype_to_astype_dict = {}
    supported = [numpy.dtype(char) for char in '?bhilqBHILQefdFD']
    for out_dtype in supported:
        cast_rules = tuple(
            '{}->{}'.format(src.char, out_dtype.char) for src in supported)
        cast_body = 'out0 = static_cast< {} >(in0)'.format(
            get_typename(out_dtype))
        _dtype_to_astype_dict[out_dtype] = core.create_ufunc(
            'astype_{}'.format(out_dtype), cast_rules, cast_body)
class _VariableProxy:
    """Abstracted array/scalar object passed to the target function.

    Wraps a traced variable and forwards Python operators and methods to
    the corresponding :mod:`cupy` routines so that each operation
    performed inside the function being fused is recorded by the tracer.
    """

    def __init__(self, content):
        assert isinstance(content, cupy._core._fusion_variable._TraceVariable)
        # The traced variable this proxy stands for.
        self.content = content

    # --- Arithmetic operators ------------------------------------------
    # Each operator delegates to the matching cupy ufunc; the reflected
    # (``__r*__``) variants swap the operand order.

    def __neg__(self):
        return cupy.negative(self)

    def __add__(self, other):
        return cupy.add(self, other)

    def __radd__(self, other):
        return cupy.add(other, self)

    def __sub__(self, other):
        return cupy.subtract(self, other)

    def __rsub__(self, other):
        return cupy.subtract(other, self)

    def __mul__(self, other):
        return cupy.multiply(self, other)

    def __rmul__(self, other):
        return cupy.multiply(other, self)

    # ``__div__``/``__rdiv__`` are Python 2 operator names — presumably
    # kept for historical compatibility; Python 3 uses ``__truediv__``.
    def __div__(self, other):
        return cupy.divide(self, other)

    def __rdiv__(self, other):
        return cupy.divide(other, self)

    def __truediv__(self, other):
        return cupy.true_divide(self, other)

    def __rtruediv__(self, other):
        return cupy.true_divide(other, self)

    def __floordiv__(self, other):
        return cupy.floor_divide(self, other)

    def __rfloordiv__(self, other):
        return cupy.floor_divide(other, self)

    def __mod__(self, other):
        return cupy.remainder(self, other)

    def __rmod__(self, other):
        return cupy.remainder(other, self)

    # NOTE(review): no ``__rpow__`` counterpart is defined here.
    def __pow__(self, other):
        return cupy.power(self, other)

    def __lshift__(self, other):
        return cupy.left_shift(self, other)

    def __rlshift__(self, other):
        return cupy.left_shift(other, self)

    def __rshift__(self, other):
        return cupy.right_shift(self, other)

    def __rrshift__(self, other):
        return cupy.right_shift(other, self)

    # --- Bitwise operators ---------------------------------------------

    def __invert__(self):
        return cupy.invert(self)

    def __and__(self, other):
        return cupy.bitwise_and(self, other)

    def __rand__(self, other):
        return cupy.bitwise_and(other, self)

    def __or__(self, other):
        return cupy.bitwise_or(self, other)

    def __ror__(self, other):
        return cupy.bitwise_or(other, self)

    def __xor__(self, other):
        return cupy.bitwise_xor(self, other)

    def __rxor__(self, other):
        return cupy.bitwise_xor(other, self)

    # --- Comparison operators ------------------------------------------
    # These return traced element-wise comparison results, not booleans.
    # NOTE: defining ``__eq__`` without ``__hash__`` makes instances
    # unhashable.

    def __lt__(self, other):
        return cupy.less(self, other)

    def __le__(self, other):
        return cupy.less_equal(self, other)

    def __eq__(self, other):
        return cupy.equal(self, other)

    def __ne__(self, other):
        return cupy.not_equal(self, other)

    def __ge__(self, other):
        return cupy.greater_equal(self, other)

    def __gt__(self, other):
        return cupy.greater(self, other)

    def copy(self):
        """Return a traced copy of this variable."""
        return cupy.copy(self)

    def astype(self, dtype, order=None, casting=None, subok=None, copy=True):
        """Cast this variable to ``dtype`` via a generated cast ufunc.

        Only ``dtype`` and ``copy`` are supported; passing ``order``,
        ``casting`` or ``subok`` raises ``TypeError``.
        """
        dtype = get_dtype(dtype)
        if order is not None:
            raise TypeError('order is not supported yet')
        if casting is not None:
            raise TypeError('casting is not supported yet')
        if subok is not None:
            raise TypeError('subok is not supported yet')
        if not copy and self.dtype == dtype:
            # Same dtype and copy not requested: reuse this proxy.
            return self
        if _dtype_to_astype_dict is None:
            # Build the dtype -> cast-ufunc table lazily, once.
            _set_dtype_to_astype_dict()
        return _dtype_to_astype_dict[dtype](self)

    # --- Reductions: delegate to the cupy reduction routines -----------

    def sum(self, axis=None, dtype=None, out=None, keepdims=False):
        return cupy.sum(
            self, axis=axis, dtype=dtype, out=out, keepdims=keepdims)

    def prod(self, axis=None, dtype=None, out=None, keepdims=False):
        return cupy.prod(
            self, axis=axis, dtype=dtype, out=out, keepdims=keepdims)

    def max(self, axis=None, out=None, keepdims=False):
        return cupy.max(self, axis=axis, out=out, keepdims=keepdims)

    def min(self, axis=None, out=None, keepdims=False):
        return cupy.min(self, axis=axis, out=out, keepdims=keepdims)

    def all(self, axis=None, out=None, keepdims=False):
        return cupy.all(self, axis=axis, out=out, keepdims=keepdims)

    def any(self, axis=None, out=None, keepdims=False):
        return cupy.any(self, axis=axis, out=out, keepdims=keepdims)

    @property
    def dtype(self):
        # Dtype of the underlying traced variable.
        return self.content.dtype

    @property
    def ndim(self):
        # Number of dimensions of the underlying traced variable.
        return self.content.ndim

    @property
    def shape(self):
        raise NotImplementedError('`shape` is not supported, currently.')
class _ScalarProxy(_VariableProxy):
    """An abstracted scalar object passed to the target function.

    Attributes:
        dtype(dtype): The dtype of the array.
        imag(_ArrayProxy): The imaginary part of the array (Not implemented)
        real(_ArrayProxy): The real part of the array (Not implemented)
        ndim(int): The number of dimensions of the array.
    """

    def __repr__(self):
        # NOTE(review): ``_emit_param_name`` is not defined anywhere in
        # this file; presumably provided elsewhere or this repr path is
        # unused — verify before relying on it.
        return '_ScalarProxy({}, dtype={})'.format(
            self._emit_param_name(), self.dtype)
class _ArrayProxy(_VariableProxy):
    """An abstracted array object passed to the target function.

    Attributes:
        dtype(dtype): The dtype of the array.
        imag(_ArrayProxy): The imaginary part of the array (Not implemented)
        real(_ArrayProxy): The real part of the array (Not implemented)
        ndim(int): The number of dimensions of the array.
    """

    def __repr__(self):
        return '_ArrayProxy([...], dtype=\'{}\', ndim={})'.format(
            self.dtype.char, self.ndim)

    def _inplace_op(self, ufunc, other):
        # In-place update: pass ``self`` as the explicit output argument
        # of the ufunc.
        return ufunc(self, other, self)

    # Augmented-assignment operators all route through ``_inplace_op``.

    def __iadd__(self, other):
        return self._inplace_op(cupy.add, other)

    def __isub__(self, other):
        return self._inplace_op(cupy.subtract, other)

    def __imul__(self, other):
        return self._inplace_op(cupy.multiply, other)

    # Python 2 name, see _VariableProxy.__div__.
    def __idiv__(self, other):
        return self._inplace_op(cupy.divide, other)

    def __itruediv__(self, other):
        return self._inplace_op(cupy.true_divide, other)

    def __ifloordiv__(self, other):
        return self._inplace_op(cupy.floor_divide, other)

    def __imod__(self, other):
        return self._inplace_op(cupy.remainder, other)

    def __ipow__(self, other):
        return self._inplace_op(cupy.power, other)

    def __ilshift__(self, other):
        return self._inplace_op(cupy.left_shift, other)

    def __irshift__(self, other):
        return self._inplace_op(cupy.right_shift, other)

    def __iand__(self, other):
        return self._inplace_op(cupy.bitwise_and, other)

    def __ior__(self, other):
        return self._inplace_op(cupy.bitwise_or, other)

    def __ixor__(self, other):
        return self._inplace_op(cupy.bitwise_xor, other)

    def __getitem__(self, index):
        # Indexing is traced through the fusion history.
        return _fusion_thread_local.call_indexing(self, index)

    def __setitem__(self, slices, value):
        # Only whole-array assignment (``a[...] = v`` or ``a[:] = v``)
        # is supported; it is traced as an elementwise copy into self.
        if slices is Ellipsis or (
                isinstance(slices, slice) and slices == slice(None)):
            _fusion_thread_local.call_ufunc(
                core.elementwise_copy, value, out=self)
        else:
            raise ValueError('The fusion supports `[...]` or `[:]`.')
import itertools
import string
from libcpp cimport vector
from cupy._core cimport _carray
from cupy._core.core cimport _ndarray_init
from cupy._core.core cimport compile_with_cache
from cupy._core.core cimport _ndarray_base
from cupy._core cimport internal
from cupy._core cimport _routines_manipulation as _manipulation
from cupy_backends.cuda.api cimport driver
from cupy_backends.cuda.api cimport runtime
import cupy as _cupy
from cupy._core import _dtype
from cupy import _util
from cupy._core import _codeblock
from cupy._core import _fusion_op
from cupy._core._fusion_variable import _TraceVariable
from cupy._core._fusion_variable import _TraceScalar
from cupy._core._fusion_variable import _TraceArray
# Default CUDA block size for fused kernels; HIP gets a smaller block.
cdef Py_ssize_t _default_block_size = (
    256 if runtime._is_hip_environment else 512)
@_util.memoize(for_each_device=True)
def _cuda_compile(preamble, name, cuda_params, cuda_body, use_grid_sync):
    """Compile the fused CUDA source and return the kernel function.

    Assembles a single ``extern "C" __global__`` function from the given
    pieces, prepending the cooperative-groups header when grid-wide
    synchronization is required.  Results are memoized per device.
    """
    pieces = []
    if use_grid_sync:
        pieces.append('#include <cooperative_groups.h>\n\n')
    pieces.append(preamble)
    pieces.append('\n\nextern "C" __global__ void ')
    pieces.append(name)
    pieces.append('(')
    pieces.append(cuda_params)
    pieces.append(') ')
    pieces.append(cuda_body)
    pieces.append('\n')
    code = ''.join(pieces)
    # (For contributors) Uncomment the following line to inspect the
    # whole generated CUDA code.
    # print(code)
    module = compile_with_cache(
        code, (), None, None, True, 'nvrtc', False, use_grid_sync)
    return module.get_function(name)
cdef class FusedKernel:
    """Compiled representation of a fused kernel built from a trace.

    Holds the generated CUDA source pieces plus per-parameter metadata
    (dtype, base/view relationship, input position) needed to allocate
    arrays and launch the kernel at call time.
    """
    cdef:
        readonly object shape_constraints
        readonly str _name
        readonly list _params
        # Return convention: -1 -> None, -2 -> single array,
        # >= 0 -> tuple of that many arrays (see _get_return_value).
        readonly int _return_size
        readonly str _submodule_code
        readonly str _cuda_body
        readonly dict _cuda_params_memo
        readonly list _block_strides
        readonly bint _use_grid_sync
        readonly list _reduction_in_array
        readonly list _reduction_out_array
        readonly vector.vector[bint] _is_base
        readonly list _dtypes
        readonly vector.vector[Py_ssize_t] _input_index
        readonly vector.vector[Py_ssize_t] _view_of
        readonly vector.vector[Py_ssize_t] _out_params

    def __init__(self, name, trace_result):
        op_list = trace_result.op_list
        params = trace_result.params
        return_size = trace_result.return_size
        self.shape_constraints = trace_result.shape_constraints
        self._name = name
        self._params = sorted(params, key=lambda x: x.serial_number)
        self._cuda_params_memo = {}

        # Generate the device functions.
        submodule_code = '\n\n'.join(set(itertools.chain.from_iterable([
            op.emit_preamble_codes() for op in op_list]))) + '\n\n'
        submodule_code += '\n\n'.join(itertools.chain.from_iterable([
            op.emit_submodule_codes() for op in op_list]))

        # Generate the function body of a __global__ function.
        codes = []
        # Grid-wide sync (cooperative groups) is only needed when more
        # than one op runs inside the same kernel.
        self._use_grid_sync = len(op_list) > 1
        if self._use_grid_sync:
            codes.append('namespace _cg = cooperative_groups;')
            codes.append('_cg::grid_group _grid = _cg::this_grid();')
        for i, op in enumerate(op_list):
            if i > 0:
                codes.append('_cg::sync(_grid);')
            codes.append(op.emit_code())
        self._submodule_code = submodule_code
        self._cuda_body = str(_codeblock.CodeBlock('', codes))

        # Check the format of the return value and encode it into
        # ``_return_size`` (see the attribute comment above).
        if return_size == 'none':
            self._return_size = -1
            self._out_params.resize(0)
        elif return_size == 'single':
            self._return_size = -2
            self._out_params.resize(1)
        else:
            assert isinstance(return_size, int)
            assert return_size >= 0
            self._return_size = return_size
            self._out_params.resize(return_size)

        for p in self._params:
            assert isinstance(p, _TraceVariable)

        # Analyse the relationship between variables.
        array_dict = {}
        self._reduction_in_array = []
        self._reduction_out_array = []
        self._dtypes = []
        for i, p in enumerate(self._params):
            view_of = -1
            input_index = -1
            if p.input_index is not None:
                input_index = p.input_index
            if isinstance(p, _TraceArray):
                if p._view_of is not None:
                    # Record the index of the array this one views.
                    view_of = array_dict[p._view_of.key()]
                if p.is_output:
                    self._out_params[p.output_index] = i
                array_dict[p.key()] = i
            self._is_base.push_back(p.is_base)
            self._dtypes.append(_dtype.get_dtype(p.dtype))
            self._input_index.push_back(input_index)
            self._view_of.push_back(view_of)

        # One block-stride kernel parameter per reduction op.
        self._block_strides = []
        for op in op_list:
            if isinstance(op, _fusion_op._ReductionTraceOp):
                self._reduction_in_array.append(
                    array_dict[op.in_params.item().key()])
                self._reduction_out_array.append(
                    array_dict[op.out_params.item().key()])
                self._block_strides.append(
                    'int {}'.format(op.block_stride_name))

    def get_shapes_of_kernel_params(self, tuple args):
        """Returns the shapes of parameters passed to kern.linear_launch.
        """
        cdef list kernel_param_shapes = []
        cdef int axis
        cdef list shape
        for param in self._params:
            shape = []
            if isinstance(param, _TraceArray):
                ashape = param.ashape
                for axis in range(len(ashape)):
                    dim = ashape[axis]
                    if not isinstance(dim, int):
                        # Abstract dimension: resolve its concrete length
                        # from the corresponding input argument.
                        dim = args[dim.input_index].shape[dim.axis]
                    shape.append(dim)
            kernel_param_shapes.append(tuple(shape))
        return kernel_param_shapes

    cdef list _get_ndarray_list(self, tuple args, list shapes):
        """Get the list of ndarray corresponding to ``self._params``.
        """
        cdef list ndarray_list = []
        cdef list params = self._params
        cdef int i
        for i in range(len(params)):
            param = params[i]
            shape = shapes[i]
            if self._input_index[i] >= 0:
                # Input parameter: reuse the caller-supplied object.
                array = args[<Py_ssize_t>self._input_index[i]]
            elif isinstance(param, _TraceScalar):
                # Scalars occupy a slot but carry no array.
                array = None
            elif self._is_base[i]:
                # Base array: allocate fresh storage.
                array = _ndarray_init(
                    _cupy.ndarray, shape, self._dtypes[i], None)
            else:
                # View: derive from the previously-created base array.
                view_of = ndarray_list[<Py_ssize_t>self._view_of[i]]
                if param.is_broadcast:
                    array = _manipulation.broadcast_to(view_of, shape)
                elif param.slice_key is not None:
                    array = view_of[param.slice_key]
                elif param.rotate_axis is not None:
                    axis_permutes = list(param.rotate_axis)
                    # NOTE(review): this inner loop reuses the typed
                    # index ``i`` of the enclosing loop; confirm it does
                    # not disturb the outer iteration under Cython's
                    # C-loop optimization of ``for ... in range``.
                    for i in range(param.ndim):
                        if i not in param.rotate_axis:
                            axis_permutes.append(i)
                    axis_permutes = tuple(axis_permutes)
                    array = _manipulation._transpose(view_of, axis_permutes)
                else:
                    assert False
            # For debug
            # if isinstance(array, ndarray) and param.rotate_axis is None:
            #     assert array.shape == shape, (array.shape, shape)
            ndarray_list.append(array)
        return ndarray_list

    cdef object _get_return_value(self, list ndarray_list):
        """Get the return value of ``self.execute``.
        """
        cdef int i
        if self._return_size == -1:
            # The traced function returns nothing.
            return None
        if self._return_size == -2:
            # Single-array return.
            return ndarray_list[<Py_ssize_t>self._out_params[0]]
        # Tuple of ``_return_size`` arrays.
        return tuple([
            ndarray_list[<Py_ssize_t>self._out_params[i]]
            for i in range(self._return_size)
        ])

    cdef tuple _get_kernel_size(self, list ndarray_list):
        """Calculate launch geometry for the reduction ops.

        Returns ``(block_strides, block_size, shared_mem)``:
        ``block_strides`` has one entry per reduction op (the number of
        contiguous blocks in non-reduction axes of its input array).
        """
        cdef _ndarray_base in_array, out_array
        cdef Py_ssize_t block_size, block_stride, contiguous_size
        cdef list block_strides = []
        if len(self._reduction_in_array) == 0:
            # No reductions: fixed block size, no dynamic shared memory.
            return [], 256, 0
        block_size = _default_block_size
        for i in range(len(self._reduction_in_array)):
            in_array = ndarray_list[self._reduction_in_array[i]]
            out_array = ndarray_list[self._reduction_out_array[i]]
            # TODO(asi1024): Fix block strides for performance.
            # Count trailing contiguous elements of the input (capped
            # at 32 below).
            # NOTE(review): the inner loop below reuses ``i`` from the
            # outer loop — confirm this is intended.
            contiguous_size = 1
            itemsize = in_array.dtype.itemsize
            for i in range(out_array.ndim):
                if in_array.strides[-i-1] != contiguous_size * itemsize:
                    break
                contiguous_size *= in_array.shape[-i-1]
            contiguous_size = min(contiguous_size, 32)
            reduce_block_size = max(1, in_array.size // max(1, out_array.size))
            block_stride = max(
                contiguous_size, block_size // reduce_block_size)
            block_stride = internal.clp2(block_stride // 2 + 1)  # floor
            block_strides.append(block_stride)
        shared_mem = block_size * 32  # max bytesize of reduce_ctype.
        return block_strides, block_size, shared_mem

    cdef tuple _reduce_dims(self, list ndarray_list):
        """Reduce number of dimensions of ndarrays and returns the cache key.
        """
        cdef list params = self._params
        cdef list ndims = []
        cdef _ndarray_base array
        cdef int i
        for i in range(len(params)):
            param = params[i]
            if param.ndim <= 1:
                continue
            array = ndarray_list[i]
            array = array.reduced_view()
            ndarray_list[i] = array
            ndims.append(array.ndim)
        # The post-reduction ndims act as the CUDA-params cache key.
        return tuple(ndims)

    cdef list _get_inout_args(self, tuple args, list ndarray_list):
        """Get the arguments passed to ``kern.linear_launch``.
        """
        cdef list params = []
        cdef list indexers = []
        cdef _carray.Indexer indexer
        for i in range(len(self._params)):
            array = ndarray_list[i]
            if isinstance(array, _ndarray_base):
                indexer = _carray.Indexer.__new__(_carray.Indexer)
                indexer.init(array._shape)
                indexers.append(indexer)
                params.append(array)
            elif self._input_index[i] >= 0:
                # Non-array input (scalar) passed straight through.
                obj = args[<Py_ssize_t>self._input_index[i]]
                params.append(obj)
        # Arrays/scalars first, then indexers (must match the order
        # emitted by _get_cuda_params).
        return params + indexers

    cdef str _get_cuda_params(self, tuple key, list ndarray_list):
        """Get a string of parameters of CUDA main function code.
        """
        cdef int i
        if key in self._cuda_params_memo:
            return self._cuda_params_memo[key]
        cuda_params = []
        indexers = []
        for i in range(len(self._params)):
            a = self._params[i]
            if isinstance(a, _TraceArray):
                array = ndarray_list[i]
                ndim = array.ndim
                c_contiguous = 'true' if array._c_contiguous else 'false'
                index_32_bits = 'true' if array._index_32_bits else 'false'
                cuda_params.append(a.format(
                    'CArray<${type}, ${ndim}, ${cont}, ${ind32}> ${var}',
                    ndim=ndim, cont=c_contiguous, ind32=index_32_bits))
                indexers.append(
                    a.format('CIndexer<${ndim}> ${indexer}', ndim=ndim))
            elif isinstance(a, _TraceScalar):
                if a.const_value is None:
                    cuda_params.append(a.format('${type} ${var}'))
                # Scalars with a known constant value emit no kernel
                # parameter — presumably inlined into the generated
                # code; TODO confirm.
            else:
                raise TypeError('Unknown type {}.'.format(type(a)))
        ret = cuda_params + indexers + self._block_strides
        ret = ', '.join(ret)
        self._cuda_params_memo[key] = ret
        return ret

    def execute(self, tuple args, list shapes):
        """Allocate arrays, compile (memoized) and launch the kernel."""
        ndarray_list = self._get_ndarray_list(args, shapes)
        ret = self._get_return_value(ndarray_list)
        reduce_key = self._reduce_dims(ndarray_list)
        inout_args = self._get_inout_args(args, ndarray_list)
        cuda_params = self._get_cuda_params(reduce_key, ndarray_list)
        kern = _cuda_compile(
            self._submodule_code, self._name, cuda_params, self._cuda_body,
            self._use_grid_sync)

        block_strides, block_size, shared_mem = (
            self._get_kernel_size(ndarray_list))
        # TODO(asi1024): Optimize kernel size parameter.
        if not runtime._is_hip_environment:
            # Size the grid from the CUDA occupancy calculator.
            kern_size = driver.occupancyMaxActiveBlocksPerMultiprocessor(
                kern.ptr, block_size, shared_mem) * block_size
        else:
            # In HIP sometimes the occupancy calc seems to be broken
            kern_size = block_size * 512
        kargs = inout_args + block_strides
        kern.linear_launch(
            kern_size, kargs, shared_mem, block_size,
            enable_cooperative_groups=self._use_grid_sync)
        return ret
import string
import numpy
from cupy._core import _codeblock
from cupy._core._fusion_variable import _TraceVariable
from cupy._core._fusion_variable import _TraceArray
from cupy._core._fusion_variable import _VariableSet
from cupy._core import _fusion_thread_local
from cupy._core import _kernel
from cupy._core import _reduction
from cupy._core._scalar import get_typename
class _UfuncRoutine:
"""A device function for single elementwise operations.
"""
def __init__(
self, name, ufunc, routine_code, in_params, out_params,
compute_dtypes):
assert isinstance(name, str)
assert isinstance(ufunc, _kernel.ufunc)
assert isinstance(routine_code, str)
assert isinstance(compute_dtypes, tuple)
assert all(isinstance(t, numpy.dtype) for t in compute_dtypes)
assert isinstance(in_params, list)
assert all(isinstance(p, _TraceVariable) for p in in_params)
assert isinstance(out_params, list)
assert all(isinstance(p, _TraceArray) for p in out_params)
self.name = name
self.in_params = in_params
self.out_params = out_params
self.preamble = ufunc._preamble
self.routine_code = routine_code
self.compute_dtypes = compute_dtypes
def emit_code(self):
"""Returns a CUDA device function code.
Returns a string like:
```
__device__ void cupy_add_0(int &in0_, float &in1_, double &out0_) {
typedef double in0_type;
typedef double in1_type;
typedef double out0_type;
double in0 = (double) in0_;
double in1 = (double) in1_;
double out0 = (double) out0_;
out0 = in0 + in1;
out0_ = out0;
}
```
"""
nin = len(self.in_params)
dtypes = self.compute_dtypes
assert len(self.in_params) == len(self.compute_dtypes[:nin])
in_params = [
(get_typename(p.dtype), get_typename(t), 'in{}'.format(i))
for i, (p, t) in enumerate(zip(self.in_params, dtypes[:nin]))
]
out_params = [
(get_typename(p.dtype), get_typename(t), 'out{}'.format(i))
for i, (p, t) in enumerate(zip(self.out_params, dtypes[nin:]))
]
params = in_params + out_params
params_code = ', '.join(['{} &{}_'.format(t, s) for t, _, s in params])
typedef = ['typedef {} {}_type;'.format(t, s) for _, t, s in params]
read = ['{} {} = ({}) {}_;'.format(t, s, t, s) for _, t, s in params]
write = ['{}_ = {};'.format(s, s) for _, _, s in out_params]
return _codeblock.CodeBlock(
'__device__ void {}({})'.format(self.name, params_code),
typedef + read + [self.routine_code + ';'] + write)
def emit_call_code(self):
params = self.in_params + self.out_params
return '{op_name}({params});'.format(
op_name=self.name,
params=', '.join([var.lvar_name for var in params]))
class _ElementwiseTraceOp:
    """Ufunc or elementwise kernel with types.

    Groups one or more ``_UfuncRoutine`` device functions into a single
    elementwise loop over ``ashape``.
    """

    def __init__(self, ufunc_routines, in_params, out_params, ashape):
        # The `in_params` and `out_params` should be already broadcasted to
        # `ashape`, but they don't guarantee to be exactly same as
        # `param.ashape`.
        _fusion_thread_local.check_not_runtime()
        assert isinstance(ufunc_routines, list)
        assert all(isinstance(r, _UfuncRoutine) for r in ufunc_routines)
        assert isinstance(ashape, tuple)
        self.ops = ufunc_routines
        self.in_params = _VariableSet(*in_params)
        self.out_params = _VariableSet(*out_params)
        self.ashape = ashape

    @property
    def params(self):
        """Returns the set of all variables the loop uses.
        """
        res = _VariableSet()
        for op in self.ops:
            res += _VariableSet(*op.in_params)
            res += _VariableSet(*op.out_params)
        return res

    @staticmethod
    def _emit_declaration(params, in_params):
        """Returns a tuple of size 2.

        1. CUDA code: declaring local variables.
        2. The set of arrays which require indexer.
        """
        _fusion_thread_local.check_not_runtime()
        indexed_arrays = _VariableSet()
        code = []
        for var in params:
            if var in in_params:
                if isinstance(var, _TraceArray):
                    # Input array: load through its indexer.
                    indexed_arrays.add(var)
                    f = '${type} ${lvar} = ${var}[${indexer}.get()];'
                else:
                    # Input scalar: copy the value directly.
                    f = '${type} ${lvar} = ${var};'
            else:
                # Intermediate/output variable: declare uninitialized.
                f = '${type} ${lvar};'
            code.append(var.format(f))
        return code, indexed_arrays

    @staticmethod
    def _emit_after_operation(out_params):
        """Returns a tuple of size 2.

        1. CUDA code: writing the results of operations back to global memory.
        2. The set of arrays which require indexer.
        """
        _fusion_thread_local.check_not_runtime()
        indexed_arrays = _VariableSet()
        codes = []
        for var in out_params:
            if isinstance(var, _TraceArray):
                indexed_arrays.add(var)
                f = '${var}[${indexer}.get()] = ${lvar};'
            else:
                f = '${var} = ${lvar};'
            codes.append(var.format(f))
        return codes, indexed_arrays

    @staticmethod
    def _emit_set_index(indexed_params, tid):
        """Returns a CUDA code: setting a raw index to indexers.
        """
        _fusion_thread_local.check_not_runtime()
        assert isinstance(indexed_params, _VariableSet)
        return [
            p.format('${indexer}.set(${tid});', tid=tid)
            for p in indexed_params
        ]

    def emit_code(self):
        """Emit the CUPY_FOR loop running all routines elementwise."""
        _fusion_thread_local.check_not_runtime()
        declaration, s1 = self._emit_declaration(self.params, self.in_params)
        operation = [op.emit_call_code() for op in self.ops]
        after_operation, s2 = self._emit_after_operation(self.out_params)
        index_name = 'i'
        indexed_array = s1 + s2
        # NOTE(review): assumes at least one indexed array exists; an
        # empty set would make ``next(iter(...))`` raise StopIteration.
        indexer_name = next(iter(indexed_array)).indexer_name
        indexer_setup = self._emit_set_index(indexed_array, index_name)
        return _codeblock.CodeBlock(
            'CUPY_FOR({}, {}.size())'.format(index_name, indexer_name),
            indexer_setup + declaration + operation + after_operation)

    def emit_preamble_codes(self):
        # Preambles of the contained routines; empty ones are skipped.
        return [subm.preamble for subm in self.ops if subm.preamble != '']

    def emit_submodule_codes(self):
        # One __device__ function per contained routine.
        return [str(subm.emit_code()) for subm in self.ops]
class _ReductionTraceOp:
    def __init__(self, name, reduce_func, expr, in_param, out_param, axis):
        """Reduction operation.

        Args:
            name (str): Name of the generated device function.
            reduce_func (_SimpleReductionKernel): Supplies the identity,
                preamble and expression pieces.
            expr: 4-tuple whose last three items are the reduce
                expression, the postmap cast code and the reduce ctype
                (first item unused here).
            in_param (_TraceArray): The input array variable.
            out_param (_TraceArray): The output array variable.
            axis (tuple of int): Axes being reduced over.
        """
        _fusion_thread_local.check_not_runtime()
        assert isinstance(name, str)
        assert isinstance(reduce_func, _reduction._SimpleReductionKernel)
        assert isinstance(in_param, _TraceArray)
        assert isinstance(out_param, _TraceArray)
        assert isinstance(axis, tuple)
        assert all(0 <= x < in_param.ndim for x in axis)

        self.name = name
        self.preamble = reduce_func.preamble
        self.in_params = _VariableSet(in_param)
        self.out_params = _VariableSet(out_param)
        self.block_stride_name = 'block_stride_' + name
        self.axis = axis

        # ``identity`` initializes the reduction accumulator; an empty
        # string means the kernel has no identity element.
        if reduce_func.identity is None:
            self.identity = ''
        else:
            self.identity = str(reduce_func.identity)

        _, self.expr, self.postmap_cast_code, self.reduce_ctype = expr
        if self.reduce_ctype is None:
            # Default the accumulator ctype to the output dtype.
            out_param, = self.out_params
            self.reduce_ctype = get_typename(out_param.dtype)

        self.premap_op = None
        self.postmap_op = None

    @property
    def params(self):
        # All variables this op reads or writes.
        return self.in_params + self.out_params

    def emit_code(self):
        """Emit the call statement invoking the reduction function."""
        _fusion_thread_local.check_not_runtime()
        assert len(self.in_params) == 1
        assert len(self.out_params) == 1
        in_param = list(self.in_params)[0]
        out_param = list(self.out_params)[0]
        params = ', '.join([
            in_param.var_name,
            out_param.var_name,
            in_param.indexer_name,
            out_param.indexer_name,
        ])
        return '{}({}, {});'.format(
            self.name, params, self.block_stride_name)

    def emit_preamble_codes(self):
        # The reduction kernel's preamble, if it has one.
        preamble = self.preamble
        return [preamble] if preamble != '' else []

    def emit_submodule_codes(self):
        """Returns a CUDA device function code.

        The emitted code assumes that ``block_stride`` and `blockDim.x` is a
        power of 2.
        """
        in_param, = self.in_params
        out_param, = self.out_params
        op_name = '{}_op'.format(self.name)
        postmap_name = '{}_postmap'.format(self.name)
        template = string.Template('''
#define ${op_name}(a, b) (${reduce_expr})
#define ${postmap_name}(a, out0) (${postmap_cast})
template <typename InType, typename OutType, typename InIndexerType, typename OutIndexerType>
__device__ void ${name}(
InType in_arr, OutType out_arr,
InIndexerType in_ind, OutIndexerType out_ind, int block_stride) {
typedef ${in_type} type_in0_raw;
typedef ${out_type} type_out0_raw;
typedef ${reduce_ctype} _type_reduce;
extern __shared__ char _sdata_raw[];
_type_reduce *sdata = reinterpret_cast<_type_reduce*>(_sdata_raw);
unsigned int tid = threadIdx.x;
int _J = tid >> __popc(block_stride - 1);
ptrdiff_t _j = (ptrdiff_t)_J * out_ind.size();
int J_stride = blockDim.x >> __popc(block_stride - 1);
ptrdiff_t j_stride = (ptrdiff_t)J_stride * out_ind.size();
for (ptrdiff_t _i = (ptrdiff_t)blockIdx.x * block_stride; _i < out_ind.size(); _i += (ptrdiff_t)gridDim.x * block_stride) {
_type_reduce s = _type_reduce(${identity});
ptrdiff_t i = _i + (tid & (block_stride - 1));
for (ptrdiff_t j = i + _j; j < in_ind.size(); j += j_stride) {
in_ind.set(j);
s = ${op_name}(s, static_cast<_type_reduce>(in_arr[in_ind.get()]));
}
sdata[tid] = s;
__syncthreads();
for (unsigned int block = blockDim.x / 2; block >= block_stride; block >>= 1) {
if (tid < block) {
sdata[tid] = ${op_name}(sdata[tid], sdata[tid + block]);
}
__syncthreads();
}
if (tid < block_stride) {
s = sdata[tid];
}
if (tid < block_stride && i < out_ind.size()) {
out_ind.set(i);
${postmap_name}(s, out_arr[out_ind.get()]);
}
__syncthreads();
}
}''')  # NOQA
        code = template.substitute(
            name=self.name,
            op_name=op_name,
            postmap_name=postmap_name,
            in_type=get_typename(in_param.dtype),
            out_type=get_typename(out_param.dtype),
            reduce_ctype=self.reduce_ctype,
            reduce_expr=self.expr,
            identity=self.identity,
            postmap_cast=self.postmap_cast_code
        )
        return [code]
from cupy._core import _fusion_variable
from cupy._core import _fusion_op
def _reduce_memory_access(ops):
required_memories = set()
for op in ops:
for p in op.in_params + op.out_params:
if p.memory.is_inout:
required_memories.add(p.memory)
for op in ops[::-1]:
in_memories = set([p.memory for p in op.in_params])
new_out_params = []
for p in op.out_params:
if p.memory in required_memories:
new_out_params.append(p)
op.out_params = _fusion_variable._VariableSet(*new_out_params)
# TODO(asi1024): The following improvement can be applicable only
# when the memory space is used at most once.
# `required_memories -= out_memories`
required_memories |= in_memories
return [op for op in ops if len(op.out_params) > 0]
def _normalize_ashapes(ops, variables, shape_constraints):
def normalize(shape):
return tuple([shape_constraints.evaluate(d) for d in shape])
for var in variables:
var.ashape = normalize(var.ashape)
for op in ops:
if isinstance(op, _fusion_op._ElementwiseTraceOp):
op.ashape = normalize(op.ashape)
def _fuse_two_ops(op1, op2):
    """Returns a fused Op if the two ops can be fused, and ``None`` otherwise.

    Fusion mutates ``op1`` in place, appending ``op2``'s routines and
    merging its parameter sets.
    """
    # Only elementwise-elementwise pairs are fusable for now.
    # TODO(asi1024): Support reduction postmap.
    if not isinstance(op1, _fusion_op._ElementwiseTraceOp):
        return None
    # TODO(asi1024): Support reduction premap.
    if not isinstance(op2, _fusion_op._ElementwiseTraceOp):
        return None
    if op1.ashape != op2.ashape:
        return None

    # op2 inputs produced by op1 become internal; the rest are inputs.
    merged_in = op1.in_params + (op2.in_params - op1.out_params)
    merged_out = op1.out_params + op2.out_params
    # Refuse to fuse when a distinct input and output may alias the
    # same memory space.
    if any(src.memory == dst.memory and src != dst
           for src in merged_in for dst in merged_out):
        return None

    op1.ops.extend(op2.ops)
    op1.in_params = merged_in
    op1.out_params = merged_out
    return op1
def _fuse_consecutive_ops(ops, shape_constraints):
res = []
for op in ops:
if len(res) == 0:
res.append(op)
else:
prev_op = res.pop(-1)
new_op = _fuse_two_ops(prev_op, op)
if new_op is None:
res.extend([prev_op, op])
else:
res.append(new_op)
return res
def optimize(ops, variables, shape_constraints):
    """Run the trace-level optimization pipeline and return the new ops.

    Normalizes abstract shapes, prunes dead memory writes, fuses
    consecutive elementwise ops, then prunes again (fusion can make
    further writes dead).
    """
    _normalize_ashapes(ops, variables, shape_constraints)
    pruned = _reduce_memory_access(ops)
    fused = _fuse_consecutive_ops(pruned, shape_constraints)
    return _reduce_memory_access(fused)
import threading
thread_local = threading.local()
cpdef inline bint is_old_fusing() except? -1:
    """Return whether old-style fusion tracing is active on this thread."""
    try:
        return thread_local.is_old_fusing
    except AttributeError:
        # Lazily initialize the per-thread flag on first access.
        thread_local.is_old_fusing = False
        return False
cpdef inline bint is_new_fusing() except? -1:
    """Return whether new-style fusion tracing is active on this thread."""
    try:
        return thread_local.is_new_fusing
    except AttributeError:
        # Lazily initialize the per-thread flag on first access.
        thread_local.is_new_fusing = False
        return False
cpdef inline bint is_fusing() except? -1:
    """Return whether any (old- or new-style) fusion tracing is active."""
    return is_old_fusing() or is_new_fusing()
def check_not_runtime():
    """Assert that new-style fusion tracing is active (trace time, not
    kernel runtime)."""
    assert is_new_fusing()
def call_ufunc(fusion_op, *args, **kwargs):
    """Dispatch a ufunc call to the active fusion implementation."""
    if is_new_fusing():
        # New-style tracing: record into the per-thread trace history.
        return thread_local.history.call_ufunc(fusion_op, *args, **kwargs)
    # Old-style path; imported lazily — presumably to avoid a circular
    # import at module load time (confirm).
    import cupy
    return cupy._core.fusion._call_ufunc(fusion_op, *args, **kwargs)
def call_reduction(fusion_op, *args, **kwargs):
    """Dispatch a reduction call to the active fusion implementation."""
    if is_new_fusing():
        # New-style tracing: record into the per-thread trace history.
        return thread_local.history.call_reduction(fusion_op, *args, **kwargs)
    # Old-style path; imported lazily — presumably to avoid a circular
    # import at module load time (confirm).
    import cupy
    return cupy._core.fusion._call_reduction(fusion_op, *args, **kwargs)
def call_indexing(fusion_op, *args, **kwargs):
    """Record an indexing operation in the active (new-style) trace
    history; no old-style fallback exists for indexing."""
    return thread_local.history.call_indexing(fusion_op, *args, **kwargs)
This diff is collapsed. (GitLab viewer artifact — the contents of this file section are omitted from this view.)
cdef class _AbstractDim:
    # Declaration stub (.pxd): an abstract dimension is identified by
    # the position of the source argument and the axis within it.
    cdef:
        readonly int input_index
        readonly int axis
import string
import numpy
from cupy._core import _fusion_interface
from cupy._core._scalar cimport get_typename
cdef class _AbstractDim:
    """An abstracted data structure for a length of dimensions.

    Attributes:
        input_index (int):
            The position of the element in the arguments passed to the
            fused function
        axis (int):
            The index of dimensions
    """

    def __init__(self, int input_index, int axis):
        self.input_index = input_index
        self.axis = axis

    def __hash__(self):
        # Hash on the identifying pair, consistent with __eq__ below.
        return hash((self.input_index, self.axis))

    def __eq__(self, object other):
        # A concrete (int) dimension never equals an abstract one.
        if isinstance(other, int):
            return False
        # NOTE(review): assumes ``other`` is another _AbstractDim past
        # this point; any other type raises AttributeError — confirm
        # callers only compare against ints and _AbstractDims.
        return (
            self.input_index == other.input_index
            and self.axis == other.axis
        )
class _MemorySpace:
"""A memory space object.
Attributes:
id(int): The serial number of memory space.
base_serial_number(int): The serial number of the base variable
which have this memory space.
is_input(bool): If this is set to ``True``, the memory space is
already allocated as an input array. If this is set to ``False``,
the memory space should be allocated before launching the kernel.
is_output(bool): If this is set to ``True``, the memory space is
used in the return values.
"""
def __init__(self, memory_id, base_serial_number):
assert isinstance(memory_id, int)
assert isinstance(base_serial_number, int)
self.id = memory_id
self.base_serial_number = base_serial_number
# Initially, these attributes are set to be `False`, but might be
# updated from outside.
self.is_input = False
self.is_output = False
@property
def is_inout(self):
"""Returns ``True`` if the memory space is used for inputs or outputs.
If ``True``, the memory space should not be deallocated just after
the kernel launch. If ``False``, the memory space is used only for
temporary value in the fused kernel."""
return self.is_input or self.is_output
class _TraceVariable:
    """Variable object to trace operations in the target function to be fused.

    Attributes:
        memory (_MemorySpace): The memory space the variable uses.
        serial_number (int): The serial number of the variable object.
        dtype (numpy.dtype): The dtype of the variable.
        rshape (tuple of int): The real shape of the variable.
        ashape (tuple of int or _AbstractDim): An abstracted shape of the
            variable.
        input_index (int or None): If not ``None``, this variable is used as
            the ``input_index``-th input parameter.
        output_index (int or None): If not ``None``, this variable is used as
            the ``output_index``-th output parameter.
    """

    def __init__(
            self, memory_space, serial_number, dtype, rshape, ashape,
            input_index, output_index):
        assert isinstance(memory_space, _MemorySpace)
        assert isinstance(serial_number, int)
        assert isinstance(dtype, numpy.dtype)
        assert input_index is None or isinstance(input_index, int)
        assert output_index is None or isinstance(output_index, int)
        assert isinstance(rshape, tuple)
        assert isinstance(ashape, tuple)
        # The real and abstract shapes describe the same array, so they
        # must agree dimension-by-dimension.
        assert len(rshape) == len(ashape)
        for rdim, adim in zip(rshape, ashape):
            assert isinstance(rdim, int)
            assert isinstance(adim, (int, _AbstractDim))
        self.memory = memory_space
        self.serial_number = serial_number
        self.dtype = dtype
        self.rshape = rshape
        self.ashape = ashape
        self.input_index = input_index
        self.output_index = output_index

    @property
    def ndim(self):
        # Number of dimensions, taken from the abstract shape.
        return len(self.ashape)

    @property
    def is_base(self):
        # ``True`` iff this variable is the base owner of its memory
        # space rather than a view of it.
        return self.serial_number == self.memory.base_serial_number

    @property
    def is_input(self):
        return self.input_index is not None

    @property
    def is_output(self):
        return self.output_index is not None

    @property
    def var_name(self):
        # The name of variable stored in global memory space.
        # Overridden by _TraceScalar / _TraceArray.
        raise NotImplementedError

    @property
    def lvar_name(self):
        # The name of variable stored in registers in each thread.
        # Overridden by _TraceScalar / _TraceArray.
        raise NotImplementedError

    @property
    def indexer_name(self):
        """The name of CUDA CIndexer variable for the variable.
        """
        # TODO(asi1024): Unify indexer with other variables which have the
        # same shape, for performance improvements.
        return 'ind{}_{}'.format(self.memory.id, self.serial_number)

    def format(self, form, **kwargs):
        """Returns a string following the format taken as an input.

        ``$type``, ``$var``, ``$lvar`` and ``$indexer`` in ``form`` expand
        to this variable's C type name and generated variable names; extra
        keyword arguments are substituted as well, with any dtype value
        first converted to its C type name.
        """
        kwargs = dict([
            (k, get_typename(v) if isinstance(v, numpy.dtype) else v)
            for k, v in kwargs.items()]
        )
        # NOTE(review): relies on a module-level ``string`` import that is
        # outside this chunk -- confirm it exists in the full file.
        return string.Template(form).substitute(
            type=get_typename(self.dtype),
            var=self.var_name,
            lvar=self.lvar_name,
            indexer=self.indexer_name,
            **kwargs
        )

    def __hash__(self):
        # Deliberately unusable: these objects must not be put in
        # set/dict, whose content order is not guaranteed to be stable.
        assert False, (
            '__hash__ is not defined. Use _VariableSet instead of '
            'set/dict because they do not guarantee the order of contents.')
class _TraceScalar(_TraceVariable):
    """An abstracted scalar object.

    Attributes:
        const_value (scalar object or None): A compile-time constant value.
            Actually, it is ``None`` iff ``self.is_input`` is ``True``.
    """

    # TODO(asi1024): Remove index argument.
    def __init__(
            self, index, serial_number, dtype, input_index=None, *,
            const_value=None,):
        # Scalars have no shape (rshape and ashape are both ``()``) and
        # are never used as an output parameter.
        super().__init__(
            index, serial_number, dtype, (), (), input_index, None)
        self.const_value = const_value

    @property
    def var_name(self):
        if self.const_value is None:
            # Runtime input scalar: named after its memory space id.
            return 'a{}'.format(self.memory.id)
        # Compile-time constants are emitted as C literals.
        if self.dtype == '?':
            # C++ boolean literals are lowercase (true/false).
            return str(self.const_value).lower()
        if self.dtype.kind == 'c':
            # Complex constants need explicit construction from the real
            # and imaginary parts.
            return '{}({}, {})'.format(
                get_typename(self.dtype),
                self.const_value.real,
                self.const_value.imag)
        return str(self.const_value)

    @property
    def lvar_name(self):
        return 'v{}'.format(self.memory.id)

    def as_interface(self):
        # Wraps this trace object for use inside the traced user function.
        return _fusion_interface._ScalarProxy(self)

    def key(self):
        # Two scalar variables are identified iff they share a memory space.
        return (self.memory.id,)
class _TraceArray(_TraceVariable):
    """An abstracted array object.

    At most one of the three view relations below is set, via the
    corresponding keyword argument to ``__init__``:

    Attributes:
        broadcasted_from (_TraceArray, optional): Array this one was
            broadcast from.
        rotated_from (_TraceArray, optional): Array this one was rotated
            from.
        axis (int, optional): The axis to rotate.
        indexed_from (_TraceArray, optional): Array this one was sliced
            from.
        index_key (slice): The key used for the slicing.
    """

    def __init__(
            self, index, serial_number, dtype, input_index=None,
            output_index=None, *, rshape, ashape, **kwargs):
        if ashape is None:
            # Input arrays get a fully symbolic shape: one _AbstractDim
            # per axis, tied to the input's position.
            assert input_index is not None
            ndim = len(rshape)
            ashape = tuple([
                _AbstractDim(input_index, axis) for axis in range(ndim)])
        super().__init__(
            index, serial_number, dtype, rshape, ashape,
            input_index, output_index)
        # View bookkeeping: which array (if any) this one is a view of,
        # and the kind of view (broadcast / rotate / slice).
        self._view_of = None
        self.is_broadcast = False
        self.rotate_axis = None
        self.slice_key = None
        if 'broadcasted_from' in kwargs:
            self._view_of = kwargs.pop('broadcasted_from')
            self.is_broadcast = True
        elif 'rotated_from' in kwargs:
            self._view_of = kwargs.pop('rotated_from')
            self.rotate_axis = kwargs.pop('axis')
        elif 'indexed_from' in kwargs:
            self._view_of = kwargs.pop('indexed_from')
            self.slice_key = kwargs.pop('index_key')
        # No other keyword arguments are accepted.
        assert len(kwargs) == 0, kwargs

    @property
    def var_name(self):
        return 'a{}_{}'.format(self.memory.id, self.serial_number)

    @property
    def lvar_name(self):
        return 'v{}_{}'.format(self.memory.id, self.serial_number)

    def as_interface(self):
        # Wraps this trace object for use inside the traced user function.
        return _fusion_interface._ArrayProxy(self)

    def make_view(self, serial_number, **kwargs):
        # Creates a new trace array sharing this array's memory space.
        # ``rshape``/``ashape`` default to this array's shapes; remaining
        # kwargs (e.g. ``broadcasted_from``) describe the view relation.
        rshape = kwargs.pop('rshape', self.rshape)
        ashape = kwargs.pop('ashape', self.ashape)
        return _TraceArray(
            self.memory, serial_number, self.dtype,
            rshape=rshape, ashape=ashape, **kwargs)

    def key(self):
        """Two variables can be identified if they have the same key.
        """
        if isinstance(self.slice_key, tuple):
            # Normalize slice objects into hashable (start, stop, step)
            # triples; only trivial slices are supported.
            slice_key = []
            for s in self.slice_key:
                if isinstance(s, slice):
                    if not (s.start is None
                            and s.stop is None
                            and s.step in (None, 1, -1)):
                        raise NotImplementedError(
                            'Basic slice supports only x[::] and x[::-1].')
                    slice_key.append((s.start, s.stop, s.step))
                else:
                    slice_key.append(s)
            slice_key = tuple(slice_key)
        else:
            slice_key = self.slice_key
        return (
            self.memory.id, self.ashape, self.input_index,
            getattr(self._view_of, 'serial_number', None),
            self.is_broadcast, self.rotate_axis, slice_key,
        )
class _VariableSet:
    """An order-preserving, duplicate-free collection of trace variables.

    Used instead of ``set``/``dict`` so the iteration order of the
    contents stays stable.
    """

    def __init__(self, *args):
        self.contents = []
        for var in args:
            assert isinstance(var, _TraceVariable)
            self.add(var)

    def __len__(self):
        return len(self.contents)

    def item(self):
        # Unwrap the single element; exactly one must be present.
        assert len(self.contents) == 1
        return self.contents[0]

    def add(self, x):
        # Append only if not already present (linear scan keeps order).
        if x not in self.contents:
            self.contents.append(x)

    def __iadd__(self, other):
        assert isinstance(other, _VariableSet)
        for var in other.contents:
            self.add(var)
        return self

    def __add__(self, other):
        merged = _VariableSet(*self.contents)
        merged += other
        return merged

    def __contains__(self, elem):
        return elem in self.contents

    def __iter__(self):
        return iter(self.contents)

    def __isub__(self, other):
        assert isinstance(other, _VariableSet)
        for var in other.contents:
            if var in self.contents:
                self.contents.remove(var)
        return self

    def __sub__(self, other):
        remaining = _VariableSet(*self.contents)
        remaining -= other
        return remaining
This diff is collapsed.
from libcpp cimport vector
from cupy._core cimport _carray
from cupy._core cimport _scalar
from cupy._core._carray cimport shape_t
from cupy._core.core cimport _ndarray_base
from cupy.cuda cimport memory
from cupy.cuda cimport texture
cdef class ParameterInfo:
    # Parsed description of a single kernel parameter declaration
    # (the parsing logic lives in the corresponding .pyx file).
    cdef:
        readonly str name
        readonly object dtype
        readonly str ctype
        readonly bint raw       # presumably marks 'raw' params -- confirm in .pyx
        readonly bint is_const
cdef enum _ArgKind:
    # Discriminates the kinds of arguments a kernel can receive.
    ARG_KIND_NDARRAY = 1
    ARG_KIND_INDEXER
    ARG_KIND_SCALAR
    ARG_KIND_POINTER
    ARG_KIND_TEXTURE
cdef class _ArgInfo:
    # Holds metadata of an argument.
    # This class is immutable and used as a part of hash keys.
    cdef:
        readonly _ArgKind arg_kind
        readonly type type
        readonly object dtype
        readonly int ndim
        readonly bint c_contiguous
        readonly bint index_32_bits

    cdef _ArgInfo _init(
        self,
        _ArgKind arg_kind,
        type typ,
        object dtype,
        int ndim,
        bint c_contiguous,
        bint index_32_bits)

    # Factory methods: one per supported argument kind, plus a generic
    # dispatcher (from_arg).
    @staticmethod
    cdef _ArgInfo from_arg(object arg)
    @staticmethod
    cdef _ArgInfo from_ndarray(_ndarray_base arg)
    @staticmethod
    cdef _ArgInfo from_scalar(_scalar.CScalar arg)
    @staticmethod
    cdef _ArgInfo from_indexer(_carray.Indexer arg)
    @staticmethod
    cdef _ArgInfo from_memptr(memory.MemoryPointer arg)
    @staticmethod
    cdef _ArgInfo from_texture(texture.TextureObject arg)

    cdef _ArgInfo as_ndarray_with_ndim(self, int ndim)
    cdef bint is_ndarray(self)
    cdef bint is_scalar(self)

    # Helpers producing C-source fragments for kernel code generation.
    cdef str get_c_type(self)
    cdef str get_param_c_type(self, ParameterInfo p)
    cdef str get_c_var_name(self, ParameterInfo p)
cdef class _TypeMap:
    # Typedef mapping between C types.
    # This class is immutable.
    cdef:
        tuple _pairs  # pairs consumed by get_typedef_code -- layout in .pyx

    # Renders the mapping as C 'typedef' source lines.
    cdef str get_typedef_code(self)
cdef class _Op:
    """Simple data structure that represents a kernel routine with single \
concrete dtype mapping.
    """
    cdef:
        readonly tuple in_types
        readonly tuple out_types
        readonly int nin
        readonly int nout
        readonly object routine
        # If the type combination specified by in_types and out_types is
        # disallowed, error_func must be set instead of routine.
        # It's called by check_valid() method.
        readonly object error_func

    @staticmethod
    cdef _Op _from_type_and_routine_or_error_func(
        str typ, object routine, object error_func)

    # Creates an op instance parsing a dtype mapping.
    @staticmethod
    cdef _Op from_type_and_routine(str typ, routine)

    cpdef tuple get_in_dtypes(self)
    cpdef tuple get_out_dtypes(self)

    # Creates an op instance parsing a dtype mapping with given error function.
    @staticmethod
    cdef _Op from_type_and_error_func(str typ, error_func)

    # Raises an error if error_func is given.
    cdef check_valid(self)
cdef class _Ops:
    """A kernel routine representation with various dtype mappings.
    """
    cdef:
        readonly tuple ops  # the candidate _Op instances
        readonly int nin
        readonly int nout

    @staticmethod
    cdef _Ops from_tuples(object ops, routine)

    # Queries a single op from input arguments.
    cpdef _Op guess_routine(
        self, str name, dict cache, list in_args, dtype, _Ops out_ops)
    cpdef _Op _guess_routine_from_in_types(
        self, tuple in_types, object can_cast=*)
    cpdef _Op _guess_routine_from_dtype(self, object dtype)
# Public ufunc factory ('*' marks optional arguments in .pxd declarations).
cpdef create_ufunc(name, ops, routine=*, preamble=*, doc=*,
                   default_casting=*, loop_prep=*, out_ops=*,
                   cutensor_op=*, scatter_op=*)

# Helpers shared by the elementwise-kernel machinery (defined in the .pyx).
cdef tuple _get_arginfos(list args)
cdef str _get_kernel_params(tuple params, tuple arginfos)
cdef list _broadcast(list args, tuple params, bint use_size, shape_t& shape)
cdef list _get_out_args_from_optionals(
    subtype, list out_args, tuple out_types, const shape_t& out_shape, casting,
    obj)
cdef list _get_out_args_with_params(
    list out_args, tuple out_types,
    const shape_t& out_shape, tuple out_params, bint is_size_specified)
cdef _check_peer_access(_ndarray_base arr, int device_id)
cdef list _preprocess_args(int dev_id, args, bint use_c_scalar)
cdef shape_t _reduce_dims(list args, tuple params, const shape_t& shape)
This diff is collapsed.
from cupy._core.core cimport _ndarray_base
from libcpp.pair cimport pair
cpdef pair[Py_ssize_t, Py_ssize_t] get_bound(_ndarray_base array)
cpdef bint may_share_bounds(_ndarray_base a, _ndarray_base b)
from cupy._core.core cimport _ndarray_base
from cupy.cuda cimport memory
from libcpp.pair cimport pair
cpdef pair[Py_ssize_t, Py_ssize_t] get_bound(_ndarray_base array):
    # Computes the half-open byte range [left, right) of the memory that
    # ``array`` touches, accounting for negative strides.
    cdef Py_ssize_t left = array.data.ptr
    cdef Py_ssize_t right = left
    cdef Py_ssize_t tmp
    cdef pair[Py_ssize_t, Py_ssize_t] ret
    cdef size_t i
    for i in range(array._shape.size()):
        # shape[i] != 0 is assumed
        # Extent contributed by axis i: a positive stride extends the
        # range to the right, a negative stride to the left.
        tmp = (array._shape[i] - 1) * array._strides[i]
        if tmp > 0:
            right += tmp
        else:
            left += tmp
    ret.first = left
    # The rightmost element still occupies ``itemsize`` bytes.
    ret.second = right + <Py_ssize_t>array.dtype.itemsize
    return ret
cpdef bint may_share_bounds(_ndarray_base a, _ndarray_base b):
    # Conservative overlap test: True when the byte ranges of the two
    # arrays intersect within the same allocation on the same device.
    cdef memory.MemoryPointer a_data = a.data
    cdef memory.MemoryPointer b_data = b.data
    cdef pair[Py_ssize_t, Py_ssize_t] a_range, b_range
    # Different devices, different allocations, or empty arrays can
    # never overlap.
    if (a_data.device_id != b_data.device_id
            or a_data.mem.ptr != b_data.mem.ptr
            or a.size == 0 or b.size == 0):
        return False
    a_range = get_bound(a)
    b_range = get_bound(b)
    # Half-open interval intersection test.
    return a_range.first < b_range.second and b_range.first < a_range.second
# Thread-local storage for the active optimization context.
cdef object _thread_local
# Cache of contexts keyed by their ``key`` string.
cdef dict _contexts


cdef class _OptimizationConfig:
    # Tuning knobs for one kernel-parameter optimization run.
    cdef readonly object optimize_impl
    cdef readonly int max_trials
    cdef readonly float timeout
    cdef readonly float expected_total_time_per_trial
    cdef readonly float max_total_time_per_trial


cdef class _OptimizationContext:
    # Stores the optimized parameters found for a given key.
    cdef readonly str key
    cdef readonly _OptimizationConfig config
    cdef readonly dict _params_map
    cdef readonly bint _dirty


cpdef _OptimizationContext get_current_context()
import pickle
import threading
# Per-thread slot holding the currently active optimization context.
cdef _thread_local = threading.local()
# Process-wide cache of _OptimizationContext objects keyed by ``key``.
cdef _contexts = {}
cdef class _OptimizationConfig:
    """Holds the knobs controlling one optimization run (trial count and
    time limits)."""

    def __init__(
            self, optimize_impl, *,
            int max_trials=100,
            float timeout=1,
            float expected_total_time_per_trial=100 * 1e-6,
            float max_total_time_per_trial=0.1):
        self.optimize_impl = optimize_impl
        self.max_trials = max_trials
        self.timeout = timeout
        self.expected_total_time_per_trial = expected_total_time_per_trial
        self.max_total_time_per_trial = max_total_time_per_trial
cdef class _OptimizationContext:
    """Per-key store of optimized parameters.

    ``_dirty`` tracks whether ``_params_map`` has changes that have not
    been saved to disk yet.
    """

    def __init__(self, str key, _OptimizationConfig config):
        self.key = key
        self.config = config
        self._params_map = {}
        self._dirty = False

    def get_params(self, key):
        # Returns the cached params for ``key``, or None if not set yet.
        return self._params_map.get(key)

    def set_params(self, key, params):
        self._params_map[key] = params
        self._dirty = True

    def save(self, filepath):
        # Persists (key, params_map) via pickle and marks the context clean.
        with open(filepath, mode='wb') as f:
            pickle.dump((self.key, self._params_map), f)
        self._dirty = False

    def load(self, filepath):
        # NOTE(review): pickle.load on an untrusted file can execute
        # arbitrary code -- only load trusted files.
        with open(filepath, mode='rb') as f:
            key, params_map = pickle.load(f)
        # Refuse to load parameters recorded for a different context key.
        if key != self.key:
            raise ValueError(
                'Optimization key mismatch {} != {}'.format(key, self.key))
        self._params_map = params_map
        self._dirty = False

    def _is_dirty(self):
        return self._dirty
cpdef _OptimizationContext get_current_context():
    # Returns the context bound to the calling thread, or None.
    try:
        return _thread_local.current_context
    except AttributeError:
        # set_current_context() has not run in this thread yet.
        return None
def set_current_context(_OptimizationContext context):
    # Binds ``context`` as the active context for the calling thread.
    _thread_local.current_context = context
def get_new_context(
        str key, object optimize_impl, dict config_dict):
    # Returns the cached context for ``key``; creates and caches a new one
    # with the given config only when no context exists for ``key`` yet
    # (in which case ``optimize_impl``/``config_dict`` are ignored).
    c = _contexts.get(key)
    if c is None:
        config = _OptimizationConfig(optimize_impl, **config_dict)
        c = _OptimizationContext(key, config)
        _contexts[key] = c
    return c
def _clear_all_contexts_cache():
    # Drops every cached context; only valid while no context is active.
    global _contexts
    assert get_current_context() is None
    _contexts = {}
from cupy._core._carray cimport shape_t
from cupy._core cimport _kernel
from cupy._core.core cimport _ndarray_base
from cupy.cuda cimport function
# Block size used by the reduction kernels (set in the .pyx).
cdef Py_ssize_t _block_size

# Axis/shape normalization helpers. _get_axis presumably returns
# (reduce_axis, out_axis) tuples -- see _get_out_shape's parameters.
cpdef tuple _get_axis(object axis, Py_ssize_t ndim)
cpdef shape_t _get_out_shape(
    const shape_t& shape, tuple reduce_axis, tuple out_axis, bint keepdims)
cdef class _AbstractReductionKernel:
    # Common base for reduction kernels (declarations only; the
    # implementations live in the corresponding .pyx file).
    cdef:
        readonly str name
        public str identity
        readonly tuple in_params
        readonly tuple out_params
        readonly tuple _params
        readonly str __name__
        readonly dict _cached_codes

    # Entry point dispatching one reduction call.
    cpdef _ndarray_base _call(
        self,
        list in_args, list out_args,
        const shape_t& a_shape, axis, dtype,
        bint keepdims, bint reduce_dims, int device_id,
        stream, bint try_use_cub=*, bint sort_reduce_axis=*)

    cdef void _launch(
        self, out_block_num, block_size, block_stride,
        in_args, out_args, in_shape, out_shape, types,
        map_expr, reduce_expr, post_map_expr, reduce_type,
        stream, params)

    cdef tuple _get_expressions_and_types(
        self, list in_args, list out_args, dtype)

    cdef list _get_out_args(
        self, list out_args, tuple out_types, const shape_t& out_shape)

    cdef function.Function _get_function(
        self,
        tuple params, tuple arginfos, _kernel._TypeMap types,
        str map_expr, str reduce_expr, str post_map_expr, str reduce_type,
        Py_ssize_t block_size)
cdef class ReductionKernel(_AbstractReductionKernel):
    # User-defined reduction kernel; adds the parsed kernel-definition
    # attributes on top of the abstract base.
    cdef:
        readonly int nin
        readonly int nout
        readonly int nargs
        readonly tuple params
        readonly str reduce_expr
        readonly str map_expr
        readonly str post_map_expr
        readonly object options
        readonly bint reduce_dims
        readonly object reduce_type
        readonly str preamble
# Internal helpers for reduction kernels (defined in the .pyx).
cdef shape_t _set_permuted_args(
    list args, tuple axis_permutes, const shape_t& shape, tuple params)
cdef tuple _get_shape_and_strides(list in_args, list out_args)
cdef _optimizer_copy_arg(a)

# Factory for the built-in reduction functions.
cpdef create_reduction_func(
    name, ops, routine=*, identity=*, preamble=*, sort_reduce_axis=*)
This diff is collapsed.
# Module-level ufunc objects; re-exported to Python at the bottom of the
# file (cythonized data cannot be exposed to Python directly).
cdef object _bitwise_and
cdef object _bitwise_or
cdef object _bitwise_xor
cdef object _invert
cdef object _left_shift
cdef object _right_shift
from cupy._core._kernel import create_ufunc
cdef _create_bit_op(name, op, no_bool, doc='', scatter_op=None):
    # Builds a binary bitwise ufunc named 'cupy_<name>' that applies the
    # C operator ``op`` elementwise over the integer dtypes. When
    # ``no_bool`` is True the boolean overload ('??->?') is omitted.
    types = () if no_bool else ('??->?',)
    return create_ufunc(
        'cupy_' + name,
        types + ('bb->b', 'BB->B', 'hh->h', 'HH->H', 'ii->i', 'II->I', 'll->l',
                 'LL->L', 'qq->q', 'QQ->Q'),
        'out0 = in0 %s in1' % op,
        doc=doc, scatter_op=scatter_op)
# Concrete ufunc instances. The triple-quoted texts below are runtime
# data (each becomes the ufunc's __doc__), so they are kept verbatim.
cdef _bitwise_and = _create_bit_op(
    'bitwise_and', '&', False,
    '''Computes the bitwise AND of two arrays elementwise.
Only integer and boolean arrays are handled.
.. seealso:: :data:`numpy.bitwise_and`
''',
    scatter_op='and')

cdef _bitwise_or = _create_bit_op(
    'bitwise_or', '|', False,
    '''Computes the bitwise OR of two arrays elementwise.
Only integer and boolean arrays are handled.
.. seealso:: :data:`numpy.bitwise_or`
''',
    scatter_op='or')

cdef _bitwise_xor = _create_bit_op(
    'bitwise_xor', '^', False,
    '''Computes the bitwise XOR of two arrays elementwise.
Only integer and boolean arrays are handled.
.. seealso:: :data:`numpy.bitwise_xor`
''',
    scatter_op='xor')

# invert is built directly: the boolean overload uses logical NOT (!)
# while the integer overloads use bitwise NOT (~).
cdef _invert = create_ufunc(
    'cupy_invert',
    (('?->?', 'out0 = !in0'), 'b->b', 'B->B', 'h->h', 'H->H', 'i->i', 'I->I',
     'l->l', 'L->L', 'q->q', 'Q->Q'),
    'out0 = ~in0',
    doc='''Computes the bitwise NOT of an array elementwise.
Only integer and boolean arrays are handled.
.. note::
:func:`cupy.bitwise_not` is an alias for :func:`cupy.invert`.
.. seealso:: :data:`numpy.invert`
''')

# The shift ufuncs pass no_bool=True: no boolean overload is generated.
cdef _left_shift = _create_bit_op(
    'left_shift', '<<', True,
    '''Shifts the bits of each integer element to the left.
Only integer arrays are handled.
.. seealso:: :data:`numpy.left_shift`
''')

cdef _right_shift = _create_bit_op(
    'right_shift', '>>', True,
    '''Shifts the bits of each integer element to the right.
Only integer arrays are handled
.. seealso:: :data:`numpy.right_shift`
''')

# Variables to expose to Python
# (cythonized data cannot be exposed to Python, even with cpdef.)
bitwise_and = _bitwise_and
bitwise_or = _bitwise_or
bitwise_xor = _bitwise_xor
invert = _invert
left_shift = _left_shift
right_shift = _right_shift
from cupy._core.core cimport _ndarray_base
cpdef _ndarray_base _ndarray_argwhere(_ndarray_base self)

# Indexing entry points backing ndarray __getitem__/__setitem__.
cdef _ndarray_base _ndarray_getitem(_ndarray_base self, slices)
cdef _ndarray_setitem(_ndarray_base self, slices, value)

cdef tuple _ndarray_nonzero(_ndarray_base self)
cdef _scatter_op(_ndarray_base a, slices, value, op)

# take/put-family routines.
cdef _ndarray_base _ndarray_take(_ndarray_base self, indices, axis, out)
cdef _ndarray_base _ndarray_put(_ndarray_base self, indices, values, mode)
cdef _ndarray_base _ndarray_choose(_ndarray_base self, choices, out, mode)
cdef _ndarray_base _ndarray_compress(_ndarray_base self, condition, axis, out)
cdef _ndarray_base _ndarray_diagonal(_ndarray_base self, offset, axis1, axis2)
cdef _ndarray_base _add_reduceat(
    _ndarray_base array, indices, axis, dtype, out)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment