import numpy
from cupy._core._dtype import get_dtype
import cupy
from cupy._core import _fusion_thread_local
from cupy._core import core
from cupy._core._scalar import get_typename
_thread_local = _fusion_thread_local.thread_local
_dtype_to_astype_dict = None
def _set_dtype_to_astype_dict():
"""Set a dict with dtypes and astype ufuncs to `_dtype_to_astype_dict`.
Creates a ufunc for type cast operations, and set a dict with keys
as the dtype of the output array and values as astype ufuncs.
This function is called at most once.
"""
global _dtype_to_astype_dict
_dtype_to_astype_dict = {}
dtype_list = [numpy.dtype(type_char) for type_char in '?bhilqBHILQefdFD']
for t in dtype_list:
name = 'astype_{}'.format(t)
rules = tuple(['{}->{}'.format(s.char, t.char) for s in dtype_list])
command = 'out0 = static_cast< {} >(in0)'.format(get_typename(t))
_dtype_to_astype_dict[t] = core.create_ufunc(name, rules, command)
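# Hedged usage sketch (illustrative, not part of the original module): once
# `_set_dtype_to_astype_dict()` has run, `_dtype_to_astype_dict` maps each
# output dtype to a cast ufunc built above, e.g.:
#
#     _set_dtype_to_astype_dict()
#     astype_f4 = _dtype_to_astype_dict[numpy.dtype('float32')]
#     # `astype_f4` is a cupy ufunc named 'astype_float32' whose routine is
#     # `out0 = static_cast<float>(in0)` for each supported input dtype.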
class _VariableProxy:
"""Abstracted array/scalar object passed to the target function.
"""
def __init__(self, content):
assert isinstance(content, cupy._core._fusion_variable._TraceVariable)
self.content = content
def __neg__(self):
return cupy.negative(self)
def __add__(self, other):
return cupy.add(self, other)
def __radd__(self, other):
return cupy.add(other, self)
def __sub__(self, other):
return cupy.subtract(self, other)
def __rsub__(self, other):
return cupy.subtract(other, self)
def __mul__(self, other):
return cupy.multiply(self, other)
def __rmul__(self, other):
return cupy.multiply(other, self)
def __div__(self, other):
return cupy.divide(self, other)
def __rdiv__(self, other):
return cupy.divide(other, self)
def __truediv__(self, other):
return cupy.true_divide(self, other)
def __rtruediv__(self, other):
return cupy.true_divide(other, self)
def __floordiv__(self, other):
return cupy.floor_divide(self, other)
def __rfloordiv__(self, other):
return cupy.floor_divide(other, self)
def __mod__(self, other):
return cupy.remainder(self, other)
def __rmod__(self, other):
return cupy.remainder(other, self)
def __pow__(self, other):
return cupy.power(self, other)
def __lshift__(self, other):
return cupy.left_shift(self, other)
def __rlshift__(self, other):
return cupy.left_shift(other, self)
def __rshift__(self, other):
return cupy.right_shift(self, other)
def __rrshift__(self, other):
return cupy.right_shift(other, self)
def __invert__(self):
return cupy.invert(self)
def __and__(self, other):
return cupy.bitwise_and(self, other)
def __rand__(self, other):
return cupy.bitwise_and(other, self)
def __or__(self, other):
return cupy.bitwise_or(self, other)
def __ror__(self, other):
return cupy.bitwise_or(other, self)
def __xor__(self, other):
return cupy.bitwise_xor(self, other)
def __rxor__(self, other):
return cupy.bitwise_xor(other, self)
def __lt__(self, other):
return cupy.less(self, other)
def __le__(self, other):
return cupy.less_equal(self, other)
def __eq__(self, other):
return cupy.equal(self, other)
def __ne__(self, other):
return cupy.not_equal(self, other)
def __ge__(self, other):
return cupy.greater_equal(self, other)
def __gt__(self, other):
return cupy.greater(self, other)
def copy(self):
return cupy.copy(self)
def astype(self, dtype, order=None, casting=None, subok=None, copy=True):
dtype = get_dtype(dtype)
if order is not None:
raise TypeError('order is not supported yet')
if casting is not None:
raise TypeError('casting is not supported yet')
if subok is not None:
raise TypeError('subok is not supported yet')
if not copy and self.dtype == dtype:
return self
if _dtype_to_astype_dict is None:
_set_dtype_to_astype_dict()
return _dtype_to_astype_dict[dtype](self)
def sum(self, axis=None, dtype=None, out=None, keepdims=False):
return cupy.sum(
self, axis=axis, dtype=dtype, out=out, keepdims=keepdims)
def prod(self, axis=None, dtype=None, out=None, keepdims=False):
return cupy.prod(
self, axis=axis, dtype=dtype, out=out, keepdims=keepdims)
def max(self, axis=None, out=None, keepdims=False):
return cupy.max(self, axis=axis, out=out, keepdims=keepdims)
def min(self, axis=None, out=None, keepdims=False):
return cupy.min(self, axis=axis, out=out, keepdims=keepdims)
def all(self, axis=None, out=None, keepdims=False):
return cupy.all(self, axis=axis, out=out, keepdims=keepdims)
def any(self, axis=None, out=None, keepdims=False):
return cupy.any(self, axis=axis, out=out, keepdims=keepdims)
@property
def dtype(self):
return self.content.dtype
@property
def ndim(self):
return self.content.ndim
@property
def shape(self):
        raise NotImplementedError('`shape` is not supported currently.')
class _ScalarProxy(_VariableProxy):
"""An abstracted scalar object passed to the target function.
Attributes:
dtype(dtype): The dtype of the array.
imag(_ArrayProxy): The imaginary part of the array (Not implemented)
real(_ArrayProxy): The real part of the array (Not implemented)
ndim(int): The number of dimensions of the array.
"""
def __repr__(self):
return '_ScalarProxy({}, dtype={})'.format(
self._emit_param_name(), self.dtype)
class _ArrayProxy(_VariableProxy):
"""An abstracted array object passed to the target function.
Attributes:
dtype(dtype): The dtype of the array.
imag(_ArrayProxy): The imaginary part of the array (Not implemented)
real(_ArrayProxy): The real part of the array (Not implemented)
ndim(int): The number of dimensions of the array.
"""
def __repr__(self):
return '_ArrayProxy([...], dtype=\'{}\', ndim={})'.format(
self.dtype.char, self.ndim)
def _inplace_op(self, ufunc, other):
return ufunc(self, other, self)
def __iadd__(self, other):
return self._inplace_op(cupy.add, other)
def __isub__(self, other):
return self._inplace_op(cupy.subtract, other)
def __imul__(self, other):
return self._inplace_op(cupy.multiply, other)
def __idiv__(self, other):
return self._inplace_op(cupy.divide, other)
def __itruediv__(self, other):
return self._inplace_op(cupy.true_divide, other)
def __ifloordiv__(self, other):
return self._inplace_op(cupy.floor_divide, other)
def __imod__(self, other):
return self._inplace_op(cupy.remainder, other)
def __ipow__(self, other):
return self._inplace_op(cupy.power, other)
def __ilshift__(self, other):
return self._inplace_op(cupy.left_shift, other)
def __irshift__(self, other):
return self._inplace_op(cupy.right_shift, other)
def __iand__(self, other):
return self._inplace_op(cupy.bitwise_and, other)
def __ior__(self, other):
return self._inplace_op(cupy.bitwise_or, other)
def __ixor__(self, other):
return self._inplace_op(cupy.bitwise_xor, other)
def __getitem__(self, index):
return _fusion_thread_local.call_indexing(self, index)
def __setitem__(self, slices, value):
if slices is Ellipsis or (
isinstance(slices, slice) and slices == slice(None)):
_fusion_thread_local.call_ufunc(
core.elementwise_copy, value, out=self)
else:
            raise ValueError('Fusion supports only `[...]` or `[:]`.')
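# Hedged sketch of how these proxies are exercised: inside a function traced
# by the new fusion mechanism, arguments arrive as _ArrayProxy /
# _ScalarProxy instances, so ordinary operators dispatch to cupy ufuncs via
# the methods defined above. Names below are illustrative:
#
#     @cupy.fuse()
#     def saxpy(a, x, y):
#         return a * x + y   # traced as cupy.multiply and cupy.add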
import itertools
import string
from libcpp cimport vector
from cupy._core cimport _carray
from cupy._core.core cimport _ndarray_init
from cupy._core.core cimport compile_with_cache
from cupy._core.core cimport _ndarray_base
from cupy._core cimport internal
from cupy._core cimport _routines_manipulation as _manipulation
from cupy_backends.cuda.api cimport driver
from cupy_backends.cuda.api cimport runtime
import cupy as _cupy
from cupy._core import _dtype
from cupy import _util
from cupy._core import _codeblock
from cupy._core import _fusion_op
from cupy._core._fusion_variable import _TraceVariable
from cupy._core._fusion_variable import _TraceScalar
from cupy._core._fusion_variable import _TraceArray
cdef Py_ssize_t _default_block_size = (
256 if runtime._is_hip_environment else 512)
@_util.memoize(for_each_device=True)
def _cuda_compile(preamble, name, cuda_params, cuda_body, use_grid_sync):
template = (
'${preamble}\n\n'
'extern "C" __global__ void ${name}(${cuda_params}) ${cuda_body}\n'
)
if use_grid_sync:
template = '#include <cooperative_groups.h>\n\n' + template
code = string.Template(template).substitute(
preamble=preamble,
name=name,
cuda_params=cuda_params,
cuda_body=cuda_body)
    # (For contributors) We can view the whole generated CUDA code
    # by uncommenting the following line.
# print(code)
module = compile_with_cache(
code, (), None, None, True, 'nvrtc', False, use_grid_sync)
return module.get_function(name)
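# For reference, a hedged sketch of the substituted source that reaches nvrtc
# when `use_grid_sync` is True (identifiers illustrative):
#
#     #include <cooperative_groups.h>
#
#     /* preamble: per-op __device__ functions */
#
#     extern "C" __global__ void fused_kern(
#             CArray<float, 1, true, true> a0_0, ...) {
#         /* cuda_body emitted by FusedKernel */
#     }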
cdef class FusedKernel:
cdef:
readonly object shape_constraints
readonly str _name
readonly list _params
readonly int _return_size
readonly str _submodule_code
readonly str _cuda_body
readonly dict _cuda_params_memo
readonly list _block_strides
readonly bint _use_grid_sync
readonly list _reduction_in_array
readonly list _reduction_out_array
readonly vector.vector[bint] _is_base
readonly list _dtypes
readonly vector.vector[Py_ssize_t] _input_index
readonly vector.vector[Py_ssize_t] _view_of
readonly vector.vector[Py_ssize_t] _out_params
def __init__(self, name, trace_result):
op_list = trace_result.op_list
params = trace_result.params
return_size = trace_result.return_size
self.shape_constraints = trace_result.shape_constraints
self._name = name
self._params = sorted(params, key=lambda x: x.serial_number)
self._cuda_params_memo = {}
# Generate the device functions.
submodule_code = '\n\n'.join(set(itertools.chain.from_iterable([
op.emit_preamble_codes() for op in op_list]))) + '\n\n'
submodule_code += '\n\n'.join(itertools.chain.from_iterable([
op.emit_submodule_codes() for op in op_list]))
# Generate the function body of a __global__ function.
codes = []
self._use_grid_sync = len(op_list) > 1
if self._use_grid_sync:
codes.append('namespace _cg = cooperative_groups;')
codes.append('_cg::grid_group _grid = _cg::this_grid();')
for i, op in enumerate(op_list):
if i > 0:
codes.append('_cg::sync(_grid);')
codes.append(op.emit_code())
self._submodule_code = submodule_code
self._cuda_body = str(_codeblock.CodeBlock('', codes))
# Check the format of the return value.
if return_size == 'none':
self._return_size = -1
self._out_params.resize(0)
elif return_size == 'single':
self._return_size = -2
self._out_params.resize(1)
else:
assert isinstance(return_size, int)
assert return_size >= 0
self._return_size = return_size
self._out_params.resize(return_size)
for p in self._params:
assert isinstance(p, _TraceVariable)
# Analyse the relationship between variables.
array_dict = {}
self._reduction_in_array = []
self._reduction_out_array = []
self._dtypes = []
for i, p in enumerate(self._params):
view_of = -1
input_index = -1
if p.input_index is not None:
input_index = p.input_index
if isinstance(p, _TraceArray):
if p._view_of is not None:
view_of = array_dict[p._view_of.key()]
if p.is_output:
self._out_params[p.output_index] = i
array_dict[p.key()] = i
self._is_base.push_back(p.is_base)
self._dtypes.append(_dtype.get_dtype(p.dtype))
self._input_index.push_back(input_index)
self._view_of.push_back(view_of)
self._block_strides = []
for op in op_list:
if isinstance(op, _fusion_op._ReductionTraceOp):
self._reduction_in_array.append(
array_dict[op.in_params.item().key()])
self._reduction_out_array.append(
array_dict[op.out_params.item().key()])
self._block_strides.append(
'int {}'.format(op.block_stride_name))
def get_shapes_of_kernel_params(self, tuple args):
"""Returns the shapes of parameters passed to kern.linear_launch.
"""
cdef list kernel_param_shapes = []
cdef int axis
cdef list shape
for param in self._params:
shape = []
if isinstance(param, _TraceArray):
ashape = param.ashape
for axis in range(len(ashape)):
dim = ashape[axis]
if not isinstance(dim, int):
dim = args[dim.input_index].shape[dim.axis]
shape.append(dim)
kernel_param_shapes.append(tuple(shape))
return kernel_param_shapes
cdef list _get_ndarray_list(self, tuple args, list shapes):
"""Get the list of ndarray corresponding to ``self._params``.
"""
cdef list ndarray_list = []
cdef list params = self._params
cdef int i
for i in range(len(params)):
param = params[i]
shape = shapes[i]
if self._input_index[i] >= 0:
array = args[<Py_ssize_t>self._input_index[i]]
elif isinstance(param, _TraceScalar):
array = None
elif self._is_base[i]:
array = _ndarray_init(
_cupy.ndarray, shape, self._dtypes[i], None)
else:
view_of = ndarray_list[<Py_ssize_t>self._view_of[i]]
if param.is_broadcast:
array = _manipulation.broadcast_to(view_of, shape)
elif param.slice_key is not None:
array = view_of[param.slice_key]
elif param.rotate_axis is not None:
axis_permutes = list(param.rotate_axis)
                    for axis in range(param.ndim):
                        if axis not in param.rotate_axis:
                            axis_permutes.append(axis)
axis_permutes = tuple(axis_permutes)
array = _manipulation._transpose(view_of, axis_permutes)
else:
assert False
# For debug
# if isinstance(array, ndarray) and param.rotate_axis is None:
# assert array.shape == shape, (array.shape, shape)
ndarray_list.append(array)
return ndarray_list
cdef object _get_return_value(self, list ndarray_list):
"""Get the return value of ``self.execute``.
"""
cdef int i
if self._return_size == -1:
return None
if self._return_size == -2:
return ndarray_list[<Py_ssize_t>self._out_params[0]]
return tuple([
ndarray_list[<Py_ssize_t>self._out_params[i]]
for i in range(self._return_size)
])
cdef tuple _get_kernel_size(self, list ndarray_list):
"""Calculate the numnber of contiguous blocks in non-reduction axes
of input arrays, and set them to ``self._contiguous_size``.
"""
cdef _ndarray_base in_array, out_array
cdef Py_ssize_t block_size, block_stride, contiguous_size
cdef list block_strides = []
if len(self._reduction_in_array) == 0:
return [], 256, 0
block_size = _default_block_size
for i in range(len(self._reduction_in_array)):
in_array = ndarray_list[self._reduction_in_array[i]]
out_array = ndarray_list[self._reduction_out_array[i]]
# TODO(asi1024): Fix block strides for performance.
contiguous_size = 1
itemsize = in_array.dtype.itemsize
            for axis in range(out_array.ndim):
                if in_array.strides[-axis-1] != contiguous_size * itemsize:
                    break
                contiguous_size *= in_array.shape[-axis-1]
contiguous_size = min(contiguous_size, 32)
reduce_block_size = max(1, in_array.size // max(1, out_array.size))
block_stride = max(
contiguous_size, block_size // reduce_block_size)
block_stride = internal.clp2(block_stride // 2 + 1) # floor
block_strides.append(block_stride)
shared_mem = block_size * 32 # max bytesize of reduce_ctype.
return block_strides, block_size, shared_mem
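    # Hedged worked example for the computation above (CUDA, so
    # `_default_block_size` is 512): reducing a C-contiguous float32 array of
    # shape (1024, 1024) over axis 1 gives contiguous_size = min(1024, 32)
    # = 32 and reduce_block_size = 1048576 // 1024 = 1024, so block_stride =
    # max(32, 512 // 1024) = 32, and clp2(32 // 2 + 1) = 32 (assuming
    # internal.clp2 rounds up to the next power of two) keeps the stride a
    # power of 2.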
cdef tuple _reduce_dims(self, list ndarray_list):
"""Reduce number of dimensions of ndarrays and returns the cache key.
"""
cdef list params = self._params
cdef list ndims = []
cdef _ndarray_base array
cdef int i
for i in range(len(params)):
param = params[i]
if param.ndim <= 1:
continue
array = ndarray_list[i]
array = array.reduced_view()
ndarray_list[i] = array
ndims.append(array.ndim)
return tuple(ndims)
cdef list _get_inout_args(self, tuple args, list ndarray_list):
"""Get the arguments passed to ``kern.linear_launch``.
"""
cdef list params = []
cdef list indexers = []
cdef _carray.Indexer indexer
for i in range(len(self._params)):
array = ndarray_list[i]
if isinstance(array, _ndarray_base):
indexer = _carray.Indexer.__new__(_carray.Indexer)
indexer.init(array._shape)
indexers.append(indexer)
params.append(array)
elif self._input_index[i] >= 0:
obj = args[<Py_ssize_t>self._input_index[i]]
params.append(obj)
return params + indexers
cdef str _get_cuda_params(self, tuple key, list ndarray_list):
"""Get a string of parameters of CUDA main function code.
"""
cdef int i
if key in self._cuda_params_memo:
return self._cuda_params_memo[key]
cuda_params = []
indexers = []
for i in range(len(self._params)):
a = self._params[i]
if isinstance(a, _TraceArray):
array = ndarray_list[i]
ndim = array.ndim
c_contiguous = 'true' if array._c_contiguous else 'false'
index_32_bits = 'true' if array._index_32_bits else 'false'
cuda_params.append(a.format(
'CArray<${type}, ${ndim}, ${cont}, ${ind32}> ${var}',
ndim=ndim, cont=c_contiguous, ind32=index_32_bits))
indexers.append(
a.format('CIndexer<${ndim}> ${indexer}', ndim=ndim))
elif isinstance(a, _TraceScalar):
if a.const_value is None:
cuda_params.append(a.format('${type} ${var}'))
else:
raise TypeError('Unknown type {}.'.format(type(a)))
ret = cuda_params + indexers + self._block_strides
ret = ', '.join(ret)
self._cuda_params_memo[key] = ret
return ret
def execute(self, tuple args, list shapes):
ndarray_list = self._get_ndarray_list(args, shapes)
ret = self._get_return_value(ndarray_list)
reduce_key = self._reduce_dims(ndarray_list)
inout_args = self._get_inout_args(args, ndarray_list)
cuda_params = self._get_cuda_params(reduce_key, ndarray_list)
kern = _cuda_compile(
self._submodule_code, self._name, cuda_params, self._cuda_body,
self._use_grid_sync)
block_strides, block_size, shared_mem = (
self._get_kernel_size(ndarray_list))
# TODO(asi1024): Optimize kernel size parameter.
if not runtime._is_hip_environment:
kern_size = driver.occupancyMaxActiveBlocksPerMultiprocessor(
kern.ptr, block_size, shared_mem) * block_size
else:
            # On HIP the occupancy calculation sometimes seems to be broken
kern_size = block_size * 512
kargs = inout_args + block_strides
kern.linear_launch(
kern_size, kargs, shared_mem, block_size,
enable_cooperative_groups=self._use_grid_sync)
return ret
import string
import numpy
from cupy._core import _codeblock
from cupy._core._fusion_variable import _TraceVariable
from cupy._core._fusion_variable import _TraceArray
from cupy._core._fusion_variable import _VariableSet
from cupy._core import _fusion_thread_local
from cupy._core import _kernel
from cupy._core import _reduction
from cupy._core._scalar import get_typename
class _UfuncRoutine:
"""A device function for single elementwise operations.
"""
def __init__(
self, name, ufunc, routine_code, in_params, out_params,
compute_dtypes):
assert isinstance(name, str)
assert isinstance(ufunc, _kernel.ufunc)
assert isinstance(routine_code, str)
assert isinstance(compute_dtypes, tuple)
assert all(isinstance(t, numpy.dtype) for t in compute_dtypes)
assert isinstance(in_params, list)
assert all(isinstance(p, _TraceVariable) for p in in_params)
assert isinstance(out_params, list)
assert all(isinstance(p, _TraceArray) for p in out_params)
self.name = name
self.in_params = in_params
self.out_params = out_params
self.preamble = ufunc._preamble
self.routine_code = routine_code
self.compute_dtypes = compute_dtypes
def emit_code(self):
"""Returns a CUDA device function code.
Returns a string like:
```
__device__ void cupy_add_0(int &in0_, float &in1_, double &out0_) {
typedef double in0_type;
typedef double in1_type;
typedef double out0_type;
double in0 = (double) in0_;
double in1 = (double) in1_;
double out0 = (double) out0_;
out0 = in0 + in1;
out0_ = out0;
}
```
"""
nin = len(self.in_params)
dtypes = self.compute_dtypes
assert len(self.in_params) == len(self.compute_dtypes[:nin])
in_params = [
(get_typename(p.dtype), get_typename(t), 'in{}'.format(i))
for i, (p, t) in enumerate(zip(self.in_params, dtypes[:nin]))
]
out_params = [
(get_typename(p.dtype), get_typename(t), 'out{}'.format(i))
for i, (p, t) in enumerate(zip(self.out_params, dtypes[nin:]))
]
params = in_params + out_params
params_code = ', '.join(['{} &{}_'.format(t, s) for t, _, s in params])
typedef = ['typedef {} {}_type;'.format(t, s) for _, t, s in params]
read = ['{} {} = ({}) {}_;'.format(t, s, t, s) for _, t, s in params]
write = ['{}_ = {};'.format(s, s) for _, _, s in out_params]
return _codeblock.CodeBlock(
'__device__ void {}({})'.format(self.name, params_code),
typedef + read + [self.routine_code + ';'] + write)
def emit_call_code(self):
params = self.in_params + self.out_params
return '{op_name}({params});'.format(
op_name=self.name,
params=', '.join([var.lvar_name for var in params]))
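# Hedged example of the emitted call (variable names follow the `lvar_name`
# scheme 'v{memory_id}_{serial}'; the values are illustrative):
#
#     cupy_add_0(v0_0, v1_1, v2_2);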
class _ElementwiseTraceOp:
"""Ufunc or elementwise kernel with types.
"""
def __init__(self, ufunc_routines, in_params, out_params, ashape):
        # The `in_params` and `out_params` should already be broadcast to
        # `ashape`, but they are not guaranteed to be exactly the same as
        # `param.ashape`.
_fusion_thread_local.check_not_runtime()
assert isinstance(ufunc_routines, list)
assert all(isinstance(r, _UfuncRoutine) for r in ufunc_routines)
assert isinstance(ashape, tuple)
self.ops = ufunc_routines
self.in_params = _VariableSet(*in_params)
self.out_params = _VariableSet(*out_params)
self.ashape = ashape
@property
def params(self):
"""Returns the set of all variable the loop uses.
"""
res = _VariableSet()
for op in self.ops:
res += _VariableSet(*op.in_params)
res += _VariableSet(*op.out_params)
return res
@staticmethod
def _emit_declaration(params, in_params):
"""Returns a tuple of size 2.
1. CUDA code: declaring local variables.
        2. The set of arrays which require an indexer.
"""
_fusion_thread_local.check_not_runtime()
indexed_arrays = _VariableSet()
code = []
for var in params:
if var in in_params:
if isinstance(var, _TraceArray):
indexed_arrays.add(var)
f = '${type} ${lvar} = ${var}[${indexer}.get()];'
else:
f = '${type} ${lvar} = ${var};'
else:
f = '${type} ${lvar};'
code.append(var.format(f))
return code, indexed_arrays
@staticmethod
def _emit_after_operation(out_params):
"""Returns a tuple of size 2.
1. CUDA code: writing the results of operations back to global memory.
        2. The set of arrays which require an indexer.
"""
_fusion_thread_local.check_not_runtime()
indexed_arrays = _VariableSet()
codes = []
for var in out_params:
if isinstance(var, _TraceArray):
indexed_arrays.add(var)
f = '${var}[${indexer}.get()] = ${lvar};'
else:
f = '${var} = ${lvar};'
codes.append(var.format(f))
return codes, indexed_arrays
@staticmethod
def _emit_set_index(indexed_params, tid):
"""Returns a CUDA code: setting a raw index to indexers.
"""
_fusion_thread_local.check_not_runtime()
assert isinstance(indexed_params, _VariableSet)
return [
p.format('${indexer}.set(${tid});', tid=tid)
for p in indexed_params
]
def emit_code(self):
_fusion_thread_local.check_not_runtime()
declaration, s1 = self._emit_declaration(self.params, self.in_params)
operation = [op.emit_call_code() for op in self.ops]
after_operation, s2 = self._emit_after_operation(self.out_params)
index_name = 'i'
indexed_array = s1 + s2
indexer_name = next(iter(indexed_array)).indexer_name
indexer_setup = self._emit_set_index(indexed_array, index_name)
return _codeblock.CodeBlock(
'CUPY_FOR({}, {}.size())'.format(index_name, indexer_name),
indexer_setup + declaration + operation + after_operation)
def emit_preamble_codes(self):
return [subm.preamble for subm in self.ops if subm.preamble != '']
def emit_submodule_codes(self):
return [str(subm.emit_code()) for subm in self.ops]
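# Hedged sketch of the loop emitted by `_ElementwiseTraceOp.emit_code` for
# one fused op reading `a0_0` and writing `a1_1` (names illustrative):
#
#     CUPY_FOR(i, ind0_0.size()) {
#         ind0_0.set(i);
#         ind1_1.set(i);
#         float v0_0 = a0_0[ind0_0.get()];
#         float v1_1;
#         cupy_op_0(v0_0, v1_1);
#         a1_1[ind1_1.get()] = v1_1;
#     }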
class _ReductionTraceOp:
def __init__(self, name, reduce_func, expr, in_param, out_param, axis):
"""Reduction operation.
"""
_fusion_thread_local.check_not_runtime()
assert isinstance(name, str)
assert isinstance(reduce_func, _reduction._SimpleReductionKernel)
assert isinstance(in_param, _TraceArray)
assert isinstance(out_param, _TraceArray)
assert isinstance(axis, tuple)
assert all(0 <= x < in_param.ndim for x in axis)
self.name = name
self.preamble = reduce_func.preamble
self.in_params = _VariableSet(in_param)
self.out_params = _VariableSet(out_param)
self.block_stride_name = 'block_stride_' + name
self.axis = axis
if reduce_func.identity is None:
self.identity = ''
else:
self.identity = str(reduce_func.identity)
_, self.expr, self.postmap_cast_code, self.reduce_ctype = expr
if self.reduce_ctype is None:
out_param, = self.out_params
self.reduce_ctype = get_typename(out_param.dtype)
self.premap_op = None
self.postmap_op = None
@property
def params(self):
return self.in_params + self.out_params
def emit_code(self):
_fusion_thread_local.check_not_runtime()
assert len(self.in_params) == 1
assert len(self.out_params) == 1
in_param = list(self.in_params)[0]
out_param = list(self.out_params)[0]
params = ', '.join([
in_param.var_name,
out_param.var_name,
in_param.indexer_name,
out_param.indexer_name,
])
return '{}({}, {});'.format(
self.name, params, self.block_stride_name)
def emit_preamble_codes(self):
preamble = self.preamble
return [preamble] if preamble != '' else []
def emit_submodule_codes(self):
"""Returns a CUDA device function code.
The emitted code assumes that ``block_stride`` and `blockDim.x` is a
power of 2.
"""
in_param, = self.in_params
out_param, = self.out_params
op_name = '{}_op'.format(self.name)
postmap_name = '{}_postmap'.format(self.name)
template = string.Template('''
#define ${op_name}(a, b) (${reduce_expr})
#define ${postmap_name}(a, out0) (${postmap_cast})
template <typename InType, typename OutType, typename InIndexerType, typename OutIndexerType>
__device__ void ${name}(
InType in_arr, OutType out_arr,
InIndexerType in_ind, OutIndexerType out_ind, int block_stride) {
typedef ${in_type} type_in0_raw;
typedef ${out_type} type_out0_raw;
typedef ${reduce_ctype} _type_reduce;
extern __shared__ char _sdata_raw[];
_type_reduce *sdata = reinterpret_cast<_type_reduce*>(_sdata_raw);
unsigned int tid = threadIdx.x;
int _J = tid >> __popc(block_stride - 1);
ptrdiff_t _j = (ptrdiff_t)_J * out_ind.size();
int J_stride = blockDim.x >> __popc(block_stride - 1);
ptrdiff_t j_stride = (ptrdiff_t)J_stride * out_ind.size();
for (ptrdiff_t _i = (ptrdiff_t)blockIdx.x * block_stride; _i < out_ind.size(); _i += (ptrdiff_t)gridDim.x * block_stride) {
_type_reduce s = _type_reduce(${identity});
ptrdiff_t i = _i + (tid & (block_stride - 1));
for (ptrdiff_t j = i + _j; j < in_ind.size(); j += j_stride) {
in_ind.set(j);
s = ${op_name}(s, static_cast<_type_reduce>(in_arr[in_ind.get()]));
}
sdata[tid] = s;
__syncthreads();
for (unsigned int block = blockDim.x / 2; block >= block_stride; block >>= 1) {
if (tid < block) {
sdata[tid] = ${op_name}(sdata[tid], sdata[tid + block]);
}
__syncthreads();
}
if (tid < block_stride) {
s = sdata[tid];
}
if (tid < block_stride && i < out_ind.size()) {
out_ind.set(i);
${postmap_name}(s, out_arr[out_ind.get()]);
}
__syncthreads();
}
}''') # NOQA
code = template.substitute(
name=self.name,
op_name=op_name,
postmap_name=postmap_name,
in_type=get_typename(in_param.dtype),
out_type=get_typename(out_param.dtype),
reduce_ctype=self.reduce_ctype,
reduce_expr=self.expr,
identity=self.identity,
postmap_cast=self.postmap_cast_code
)
return [code]
from cupy._core import _fusion_variable
from cupy._core import _fusion_op
def _reduce_memory_access(ops):
required_memories = set()
for op in ops:
for p in op.in_params + op.out_params:
if p.memory.is_inout:
required_memories.add(p.memory)
for op in ops[::-1]:
in_memories = set([p.memory for p in op.in_params])
new_out_params = []
for p in op.out_params:
if p.memory in required_memories:
new_out_params.append(p)
op.out_params = _fusion_variable._VariableSet(*new_out_params)
        # TODO(asi1024): The following improvement is applicable only
        # when the memory space is used at most once.
# `required_memories -= out_memories`
required_memories |= in_memories
return [op for op in ops if len(op.out_params) > 0]
def _normalize_ashapes(ops, variables, shape_constraints):
def normalize(shape):
return tuple([shape_constraints.evaluate(d) for d in shape])
for var in variables:
var.ashape = normalize(var.ashape)
for op in ops:
if isinstance(op, _fusion_op._ElementwiseTraceOp):
op.ashape = normalize(op.ashape)
def _fuse_two_ops(op1, op2):
"""Returns a fused Op if the two ops can be fused, and ``None`` otherwise.
"""
    # TODO(asi1024): Support reduction postmap.
if not isinstance(op1, _fusion_op._ElementwiseTraceOp):
return None
    # TODO(asi1024): Support reduction premap.
if not isinstance(op2, _fusion_op._ElementwiseTraceOp):
return None
if op1.ashape != op2.ashape:
return None
new_in_params = op1.in_params + (op2.in_params - op1.out_params)
new_out_params = op1.out_params + op2.out_params
for in_param in new_in_params:
for out_param in new_out_params:
# Checks if two arrays may share the same memory space.
if in_param.memory == out_param.memory and in_param != out_param:
return None
op1.ops.extend(op2.ops)
op1.in_params = new_in_params
op1.out_params = new_out_params
return op1
def _fuse_consecutive_ops(ops, shape_constraints):
res = []
for op in ops:
if len(res) == 0:
res.append(op)
else:
prev_op = res.pop(-1)
new_op = _fuse_two_ops(prev_op, op)
if new_op is None:
res.extend([prev_op, op])
else:
res.append(new_op)
return res
def optimize(ops, variables, shape_constraints):
_normalize_ashapes(ops, variables, shape_constraints)
ops = _reduce_memory_access(ops)
ops = _fuse_consecutive_ops(ops, shape_constraints)
ops = _reduce_memory_access(ops)
return ops
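# Hedged example of what `optimize` achieves: tracing `(x + 1) * 2` records
# two _ElementwiseTraceOp entries with the same ashape; _fuse_consecutive_ops
# merges them into one op (one kernel loop), and the second
# _reduce_memory_access pass drops the now-unused intermediate output of
# `x + 1` so it never touches global memory. Names illustrative:
#
#     ops = [add_op, mul_op]          # add_op's output feeds mul_op
#     ops = optimize(ops, variables, shape_constraints)
#     assert len(ops) == 1            # a single fused elementwise op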
import threading
thread_local = threading.local()
cpdef inline bint is_old_fusing() except? -1:
try:
return thread_local.is_old_fusing
except AttributeError:
thread_local.is_old_fusing = False
return False
cpdef inline bint is_new_fusing() except? -1:
try:
return thread_local.is_new_fusing
except AttributeError:
thread_local.is_new_fusing = False
return False
cpdef inline bint is_fusing() except? -1:
return is_old_fusing() or is_new_fusing()
def check_not_runtime():
assert is_new_fusing()
def call_ufunc(fusion_op, *args, **kwargs):
if is_new_fusing():
return thread_local.history.call_ufunc(fusion_op, *args, **kwargs)
import cupy
return cupy._core.fusion._call_ufunc(fusion_op, *args, **kwargs)
def call_reduction(fusion_op, *args, **kwargs):
if is_new_fusing():
return thread_local.history.call_reduction(fusion_op, *args, **kwargs)
import cupy
return cupy._core.fusion._call_reduction(fusion_op, *args, **kwargs)
def call_indexing(fusion_op, *args, **kwargs):
return thread_local.history.call_indexing(fusion_op, *args, **kwargs)
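# Hedged sketch of how the dispatch above is driven (assumption: the tracing
# entry point sets these flags around the user function call; the tracing
# module shown later assigns `thread_local.history` in the same way):
#
#     thread_local.is_new_fusing = True
#     thread_local.history = history      # an active trace object
#     try:
#         out = func(*proxy_args)         # cupy.add etc. route to call_ufunc
#     finally:
#         thread_local.is_new_fusing = False
#         thread_local.history = None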
import numpy
from cupy._core import _kernel
from cupy._core import _reduction
from cupy._core import core
from cupy._core._fusion_interface import _VariableProxy
from cupy._core._fusion_interface import _ArrayProxy
from cupy._core import _fusion_thread_local
from cupy._core import _fusion_variable
from cupy._core._fusion_variable import _AbstractDim
from cupy._core._fusion_variable import _TraceScalar
from cupy._core._fusion_variable import _TraceArray
from cupy._core._fusion_variable import _VariableSet
from cupy._core import _fusion_op
from cupy._core import _fusion_optimization
from cupy._core cimport internal
from cupy._core._dtype cimport _raise_if_invalid_cast
_thread_local = _fusion_thread_local.thread_local
_accepted_types = (int, float, bool, complex, numpy.generic)
cdef class _ShapeConstraints:
"""A data structure that manages the conditions between the shapes.
"""
cdef:
        # A list of (_AbstractDim, _AbstractDim) tuples, each representing
        # an equality between two dimensions.
        readonly list eq_constraints
        # A list of (_AbstractDim, int) tuples, used as an associative list
        readonly list const_constraints
def __init__(self):
self.eq_constraints = []
self.const_constraints = []
def add_eq_constraint(self, x, y):
"""Add a constraint: x == y.
"""
_fusion_thread_local.check_not_runtime()
assert isinstance(x, (_AbstractDim, int))
assert isinstance(y, (_AbstractDim, int))
x = self.evaluate(x)
y = self.evaluate(y)
if x == y:
return
if isinstance(x, _AbstractDim) and isinstance(y, _AbstractDim):
self.eq_constraints.append((x, y))
elif isinstance(x, _AbstractDim) and not isinstance(y, _AbstractDim):
self.add_const_constraint(x, y)
elif not isinstance(x, _AbstractDim) and isinstance(y, _AbstractDim):
self.add_const_constraint(y, x)
else:
assert False
def add_const_constraint(self, x, value):
"""Add a constraint: x == value.
"""
_fusion_thread_local.check_not_runtime()
assert isinstance(x, (_AbstractDim, int))
assert isinstance(value, int)
x = self.evaluate(x)
if isinstance(x, _AbstractDim):
self.const_constraints.append((x, value))
else:
assert x == value
def evaluate(self, x):
"""Substitute repeatedly from the equalities.
"""
_fusion_thread_local.check_not_runtime()
assert isinstance(x, (_AbstractDim, int))
for src, dest in self.eq_constraints + self.const_constraints:
if isinstance(x, int):
return x
if x == src:
x = dest
return x
# Used in runtime.
def satisfy(self, dict dim_map):
"""Check if the given dicionary satisfies the constraints.
Args:
dim_map (dict):
A dictionary with keys of _AbstractDim type and
values of int type.
"""
for a, b in self.eq_constraints:
if dim_map[a] != dim_map[b]:
return False
for a, b in self.const_constraints:
if dim_map[a] != b:
return False
return True
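# Hedged usage sketch for _ShapeConstraints (dims illustrative):
#
#     sc = _ShapeConstraints()
#     d0, d1 = _AbstractDim(0, 0), _AbstractDim(1, 0)
#     sc.add_eq_constraint(d0, d1)      # arg0 dim0 must equal arg1 dim0
#     sc.add_const_constraint(d1, 4)    # ... and both must be 4
#     assert sc.evaluate(d0) == 4
#     assert sc.satisfy({d0: 4, d1: 4})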
def _guess_routine(func, args, dtype):
assert isinstance(func, (_kernel.ufunc, _reduction._SimpleReductionKernel))
    # Feed dummy arguments with appropriate dtypes to `guess_routine`.
dummy_args = []
for x in args:
if isinstance(x, _TraceScalar):
obj = x.dtype.type(0)
else:
assert isinstance(x, _TraceArray)
obj = core.ndarray((0,), x.dtype)
dummy_args.append(obj)
op = func._ops.guess_routine(
func.name, func._routine_cache, dummy_args, dtype, None)
return op.get_in_dtypes(), op.get_out_dtypes(), op.routine
def _base(array):
"""Returns the base array object of given array.
"""
assert isinstance(array, core.ndarray)
return array if array.base is None else array.base
class _VariableCoordinator:
"""Variable constuct manager.
This class calls ``_TraceArray`` or ``_TraceScalar`` internally
with unique serial numbers and returns the variable object. In
``TraceImpl`` class, a method of ``history.vc``, which is of
``_VariableConduductor`` class, should be called instead of
```_TraceArray.__init__`` or ``_TraceScalar.__init__``.
"""
def __init__(self):
self._memory_number = 0
self._serial_number = 0
self._variables_dict = {}
def _normalize_variable(self, var):
"""If the input variable is already generated previously, returns it.
"""
key = var.key()
if key not in self._variables_dict:
self._variables_dict[key] = var
return self._variables_dict[key]
def _generate_new_variable(self, var_module, dtype, **kwargs):
serial_number = self._serial_number
memory = _fusion_variable._MemorySpace(
self._memory_number, serial_number)
self._serial_number += 1
self._memory_number += 1
ret = var_module(memory, serial_number, dtype, **kwargs)
memory.is_input = ret.is_input
return self._normalize_variable(ret)
def generate_new_array(self, dtype, rshape, ashape, input_index=None):
"""Generate new _TraceArray object with a new memory space.
"""
ret = self._generate_new_variable(
_TraceArray,
dtype, rshape=rshape, ashape=ashape, input_index=input_index)
ret.memory.base_ashape = ret.ashape
return ret
def generate_new_scalar(self, dtype, **kwargs):
"""Generate new _TraceScalar object with a new memory space.
"""
return self._generate_new_variable(_TraceScalar, dtype, **kwargs)
def make_view(self, var, **kwargs):
assert isinstance(var, _TraceArray)
serial_number = self._serial_number
self._serial_number += 1
ret = var.make_view(serial_number, **kwargs)
return self._normalize_variable(ret)
def broadcast_to(self, var, ashape, rshape):
"""Make a view of the input array with the given shape.
"""
return self.make_view(
var, ashape=ashape, rshape=rshape, broadcasted_from=var)
def rotate_with_axis(self, var, axis):
"""Make a view of an array by rotating ``var`` with given axis.
"""
assert isinstance(var, _TraceArray)
return self.make_view(var, rotated_from=var, axis=axis)
def indexing(self, var, indices):
"""Make a view of an array. by indexing ``var`` with given tuple.
"""
skip = var.ndim - sum([isinstance(x, (int, slice)) for x in indices])
it = 0
ashape = []
rshape = []
if skip < 0:
raise IndexError('Too many indices for array.')
for index in indices:
if isinstance(index, int):
it += 1
elif isinstance(index, slice):
if not (index.start is None
and index.stop is None
and index.step in (1, -1, None)):
raise NotImplementedError(
'Only full range ``x[::]`` or reverse ``x[::-1]`` is '
'supported for basic slicing in CuPy fusion.')
ashape.append(var.ashape[it])
rshape.append(var.rshape[it])
it += 1
elif index is None:
ashape.append(1)
rshape.append(1)
elif index is Ellipsis:
ashape.extend(var.ashape[it:it + skip])
rshape.extend(var.rshape[it:it + skip])
it += skip
ashape.extend(var.ashape[it:var.ndim])
rshape.extend(var.rshape[it:var.ndim])
return self.make_view(
var, indexed_from=var, index_key=indices,
ashape=tuple(ashape), rshape=tuple(rshape))
@property
def all_variables(self):
"""Returns the list of all variables this class emitted.
"""
return list(self._variables_dict.values())
class TraceImpl:
"""Emit a fused kernel from the given target function.
"""
def __init__(self):
self.vc = _VariableCoordinator()
self.shape_constraints = _ShapeConstraints()
self.op_list = []
@staticmethod
def _make_interface(x):
"""Returns an _array or a _scalar object which packs the given value.
"""
if x is None:
return None
assert isinstance(x, _fusion_variable._TraceVariable)
return x.as_interface()
def _unwrap_interface(self, x, *, allow_none=False):
"""Returns ``_TraceVariable`` object from the input.
"""
if allow_none and x is None:
return None
if isinstance(x, _VariableProxy):
return x.content
if isinstance(x, _accepted_types):
dtype = numpy.dtype(type(x))
return self.vc.generate_new_scalar(dtype, const_value=x)
if isinstance(x, (numpy.ndarray, core.ndarray)):
raise TypeError('Concrete ndarray is not supported in fusion.')
raise TypeError('{} type is not supported'.format(type(x)))
def call_ufunc(self, ufunc, *args, **kwargs):
"""Register an elementwise operation with the given parameters.
Args:
ufunc(_kernel.ufunc): The ufunc to operate.
args(tuple): The arguments.
kwargs(dict): The keyword arguments.
"""
assert isinstance(ufunc, _kernel.ufunc)
# Parse Inputs.
nin = ufunc.nin
nout = ufunc.nout
dtype = kwargs.pop('dtype', None)
if 'out' in kwargs and len(args) > nin:
raise ValueError(
'cannot specify \'out\' as both a positional and '
'keyword argument')
in_params = [self._unwrap_interface(x) for x in args[:nin]]
out_params = [
self._unwrap_interface(x, allow_none=True)
for x in args[nin:] + (kwargs.pop('out', None),)
if x is not None
]
params = in_params + out_params
if len(kwargs) > 0:
raise TypeError('Wrong arguments {}'.format(kwargs))
if len(in_params) != nin or len(out_params) > nout:
raise ValueError('Invalid number of arguments')
if not all([isinstance(v, _TraceArray) for v in out_params]):
raise TypeError('Return arrays must be of ArrayType')
# Check for inplace operation.
for i, out_param1 in enumerate(out_params):
for out_param2 in out_params[:i]:
if out_param1.memory == out_param2.memory:
# NumPy does not raise this error.
raise ValueError('Outputs of ufunc must not share memory')
# Copy the input array data before the operation when the input array
# shares the same memory area with an output array.
for i, in_param in enumerate(in_params):
should_copy = any([
in_param.memory == out_param.memory and in_param != out_param
for out_param in out_params
])
if should_copy:
in_params[i] = self._unwrap_interface(
self.call_ufunc(
core.elementwise_copy,
self._make_interface(in_param)))
# Broadcast shapes
out_rshape = internal._broadcast_shapes([p.rshape for p in params])
out_ashape = [None for _ in range(len(out_rshape))]
for p in params:
for axis in range(-p.ndim, 0):
if p.rshape[axis] == out_rshape[axis]:
out_ashape[axis] = p.ashape[axis]
assert all([dim is not None for dim in out_ashape])
out_ashape = tuple(out_ashape)
# Broadcast input params and make their views.
for i, p in enumerate(in_params):
for axis in range(-p.ndim, 0):
if p.rshape[axis] == out_rshape[axis]:
self.shape_constraints.add_eq_constraint(
p.ashape[axis], out_ashape[axis])
elif p.rshape[axis] == 1:
self.shape_constraints.add_const_constraint(
p.ashape[axis], 1)
else:
assert False
if isinstance(p, _TraceArray) and p.rshape != out_rshape:
                # Broadcast the input if needed.
in_params[i] = self.vc.broadcast_to(p, out_ashape, out_rshape)
# Get operation code from dtypes.
in_dtypes, out_dtypes, expr = _guess_routine(
ufunc, in_params, dtype)
# Make output arrays.
ret = []
for i in range(nout):
if i >= len(out_params):
# Omitted output.
out_pvar = self.vc.generate_new_array(
out_dtypes[i], out_rshape, out_ashape)
out_params.append(out_pvar)
            elif isinstance(out_params[i], _TraceScalar):
raise TypeError('return arrays must be of ArrayType')
elif out_params[i].rshape != out_rshape:
raise ValueError(
'non-broadcastable output operand with shape {} '
'doesn\'t match the broadcast shape {}'.format(
out_params[i].rshape, out_rshape))
_raise_if_invalid_cast(
out_dtypes[i], out_params[i].dtype, 'same_kind',
'output operand')
out_pvar = out_params[i]
ret.append(out_pvar)
# Register Op.
name = ufunc.name + '_' + str(len(self.op_list))
ufunc_routine = _fusion_op._UfuncRoutine(
name, ufunc, expr, in_params, out_params, in_dtypes + out_dtypes)
op = _fusion_op._ElementwiseTraceOp(
[ufunc_routine], in_params, out_params, out_ashape)
self.op_list.append(op)
# Returns.
assert len(ret) > 0
if len(ret) == 1:
return self._make_interface(ret[0])
else:
return tuple([self._make_interface(x) for x in ret])
def call_reduction(
self, reduce_func, a, axis=None, dtype=None, out=None,
keepdims=False):
"""Register a reduction operation with the given parameters.
Args:
reduce_func(_reduction._SimpleReductionKernel):
The reduction function to operate.
a(array_like): The input array.
axis(int, tuple of int or None): The axis.
dtype(numpy.dtype or None): The dtype
out(_array or None): The output array.
"""
assert isinstance(reduce_func, _reduction._SimpleReductionKernel)
# Parse inputs.
in_param = self._unwrap_interface(a)
if not isinstance(in_param, _TraceArray):
raise NotImplementedError(
'Reduction for scalar arguments is not supported.')
axes = internal._normalize_axis_indices(axis, in_param.ndim)
if dtype is not None:
dtype = numpy.dtype(dtype)
if keepdims:
raise NotImplementedError('keepdims is not supported.')
# Determine the shape of out_param.
out_ashape = tuple([
d for axis, d in enumerate(in_param.ashape) if axis not in axes])
out_rshape = tuple([
d for axis, d in enumerate(in_param.rshape) if axis not in axes])
        # Rotate axes.
        # This transpose is performed only as a performance improvement.
if not all([i == axis for i, axis in enumerate(axes)]):
in_param = self.vc.rotate_with_axis(in_param, axes)
# Get operation code from dtypes.
_, (out_dtype,), expr = _guess_routine(reduce_func, [in_param], dtype)
# Make an output array.
if out is None:
# Omitted output.
out_param = self.vc.generate_new_array(
out_dtype, out_rshape, out_ashape)
else:
out_param = self._unwrap_interface(out)
if out_param.rshape != out_rshape:
raise ValueError(
'Shape of specified output variable is not consistent '
'with reduced shape.')
# Register Op.
name = 'reduce{}'.format(len(self.op_list))
op = _fusion_op._ReductionTraceOp(
name, reduce_func, expr, in_param, out_param, axes)
self.op_list.append(op)
# Returns.
return self._make_interface(out_param)
def call_indexing(self, in_param, indices):
"""Call indexing routines.
"""
in_param = self._unwrap_interface(in_param)
if not isinstance(indices, tuple):
indices = (indices,)
        for x in indices:
            if isinstance(x, (list, _TraceArray)):
                # Advanced indexing
                raise NotImplementedError(
                    'Advanced indexing is not supported currently.')
if not (isinstance(x, (int, slice)) or x is None or x is Ellipsis):
raise IndexError(
'Indices must be integers, slices, ellipsis, None or '
'integer or boolean arrays.')
# Basic indexing
out_param = self.vc.indexing(in_param, indices)
return self._make_interface(out_param)
def trace(self, func, args):
"""Call ``self.func`` with _TraceVariable arguments.
Returns:
out_params(list of _TraceVariable): The list of outputs.
return_size(int or str): If ``return_size`` is of int type,
it indicates the size of tuple of outputs.
If `none`, the output is ``None`` and ``out_params`` is empty.
If `single`, the output is single array and ``out_params``
is a singleton list.
During the function call, ``call_ufunc``, ``call_reduction`` and
``call_indexing`` are called internally.
"""
# Register input variables.
in_params = []
array_dict = {}
memory_dict = {}
for input_index, arg in enumerate(args):
if arg is None:
var = None
elif isinstance(arg, core.ndarray):
arg_id = id(arg)
base_id = id(_base(arg))
if arg_id in array_dict:
# The array is already given as an input.
var = in_params[array_dict[arg_id]]
assert isinstance(var, _TraceArray)
elif base_id in memory_dict:
                    # This is an array which shares the same memory.
base = in_params[memory_dict[base_id]]
assert isinstance(base, _TraceArray)
var = self.vc.make_view(base, input_index=input_index)
else:
# Otherwise.
var = self.vc.generate_new_array(
arg.dtype, arg.shape, None, input_index=input_index)
array_dict[arg_id] = input_index
memory_dict[base_id] = input_index
else:
# Scalar input.
dtype = numpy.dtype(type(arg))
var = self.vc.generate_new_scalar(
dtype, input_index=input_index)
in_params.append(var)
# Call the target function.
inputs = [self._make_interface(x) for x in in_params]
output = func(*inputs)
# Register output variables.
if output is None:
return_size = 'none'
out_params = []
elif isinstance(output, _ArrayProxy):
return_size = 'single'
out_params = [self._unwrap_interface(output, allow_none=True)]
elif isinstance(output, tuple):
if all(isinstance(x, _ArrayProxy) for x in output):
return_size = len(output)
out_params = [
self._unwrap_interface(x, allow_none=True) for x in output]
else:
                raise ValueError(
                    'All elements of the return value of a fused function '
                    'must be of _ArrayProxy type.'
                )
else:
            raise ValueError(
                'The return value of a fused function must be `None`, '
                'an ndarray or a tuple of ndarrays.'
            )
for output_index, out_param in enumerate(out_params):
assert isinstance(out_param, _TraceArray)
out_param.output_index = output_index
out_param.memory.is_output = True
return out_params, return_size
def _get_ancestors_of_trace_variable(var):
if var is None:
return _VariableSet()
res = _VariableSet(var)
if isinstance(var, _TraceArray):
res += _get_ancestors_of_trace_variable(var._view_of)
return res
class _TraceResult:
def __init__(self, op_list, params, return_size, shape_constraints):
self.op_list = op_list
self.params = params
self.return_size = return_size
self.shape_constraints = shape_constraints
def trace(func, args):
history = TraceImpl()
try:
_thread_local.history = history
# Call `func(args)` and update `op_list`.
out_params, return_size = history.trace(func, args)
finally:
_thread_local.history = None
op_list = history.op_list
shape_constraints = history.shape_constraints
all_variables = history.vc.all_variables
op_list = _fusion_optimization.optimize(
op_list, all_variables, shape_constraints)
# Make info passed to FusedKernel.
kernel_params = _VariableSet()
for p in out_params:
kernel_params += _get_ancestors_of_trace_variable(p)
for op in op_list:
for p in op.in_params + op.out_params:
kernel_params += _get_ancestors_of_trace_variable(p)
kernel_params = list(kernel_params)
# used in mock tests.
history.kernel_params = kernel_params
history.op_list = op_list
return _TraceResult(op_list, kernel_params, return_size, shape_constraints)
cdef class _AbstractDim:
cdef:
readonly int input_index
readonly int axis
import string
import numpy
from cupy._core import _fusion_interface
from cupy._core._scalar cimport get_typename
cdef class _AbstractDim:
"""An abstracted data structure for a length of dimensions.
Attributes:
input_index (int):
The position of the element in the arguments passed to the
fused function
axis (int):
The index of dimensions
"""
def __init__(self, int input_index, int axis):
self.input_index = input_index
self.axis = axis
def __hash__(self):
return hash((self.input_index, self.axis))
def __eq__(self, object other):
if isinstance(other, int):
return False
return (
self.input_index == other.input_index
and self.axis == other.axis
)
class _MemorySpace:
"""A memory space object.
Attributes:
id(int): The serial number of memory space.
        base_serial_number(int): The serial number of the base variable
            which owns this memory space.
is_input(bool): If this is set to ``True``, the memory space is
already allocated as an input array. If this is set to ``False``,
the memory space should be allocated before launching the kernel.
is_output(bool): If this is set to ``True``, the memory space is
used in the return values.
"""
def __init__(self, memory_id, base_serial_number):
assert isinstance(memory_id, int)
assert isinstance(base_serial_number, int)
self.id = memory_id
self.base_serial_number = base_serial_number
# Initially, these attributes are set to be `False`, but might be
# updated from outside.
self.is_input = False
self.is_output = False
@property
def is_inout(self):
"""Returns ``True`` if the memory space is used for inputs or outputs.
If ``True``, the memory space should not be deallocated just after
the kernel launch. If ``False``, the memory space is used only for
temporary value in the fused kernel."""
return self.is_input or self.is_output
class _TraceVariable:
"""Variable object to trace operations in the target function to be fused.
Attributes:
        memory(_MemorySpace): The memory space the variable uses.
serial_number(int): The serial number of the variable object.
dtype(dtype): The dtype of the variable.
rshape(tuple of int): The real shape of the variable.
ashape(tuple of _AbstractDim): An abstracted shape of the variable.
input_index(int or None): If not `None`, this variable is used as
the `input_index`-th input parameter.
output_index(int or None): If not `None`, this variable is used as
the `output_index`-th output parameter.
"""
def __init__(
self, memory_space, serial_number, dtype, rshape, ashape,
input_index, output_index):
assert isinstance(memory_space, _MemorySpace)
assert isinstance(serial_number, int)
assert isinstance(dtype, numpy.dtype)
assert input_index is None or isinstance(input_index, int)
assert output_index is None or isinstance(output_index, int)
assert isinstance(rshape, tuple)
assert isinstance(ashape, tuple)
assert len(rshape) == len(ashape)
for rdim, adim in zip(rshape, ashape):
assert isinstance(rdim, int)
assert isinstance(adim, (int, _AbstractDim))
self.memory = memory_space
self.serial_number = serial_number
self.dtype = dtype
self.rshape = rshape
self.ashape = ashape
self.input_index = input_index
self.output_index = output_index
@property
def ndim(self):
return len(self.ashape)
@property
def is_base(self):
return self.serial_number == self.memory.base_serial_number
@property
def is_input(self):
return self.input_index is not None
@property
def is_output(self):
return self.output_index is not None
@property
def var_name(self):
        # The name of the variable stored in the global memory space.
raise NotImplementedError
@property
def lvar_name(self):
        # The name of the variable stored in registers in each thread.
raise NotImplementedError
@property
def indexer_name(self):
"""The name of CUDA CIndxer variable for the variable.
"""
# TODO(asi1024): Unify indexer with other variables which have the
# same shape, for performance improvements.
return 'ind{}_{}'.format(self.memory.id, self.serial_number)
def format(self, form, **kwargs):
"""Returns a string following the format taken as an input.
"""
kwargs = dict([
(k, get_typename(v) if isinstance(v, numpy.dtype) else v)
for k, v in kwargs.items()]
)
return string.Template(form).substitute(
type=get_typename(self.dtype),
var=self.var_name,
lvar=self.lvar_name,
indexer=self.indexer_name,
**kwargs
)
def __hash__(self):
assert False, (
'__hash__ is not defined. Use _VariableSet instead of '
'set/dict because they do not guarantee the order of contents.')
class _TraceScalar(_TraceVariable):
"""An abstracted scalar object.
Attributes:
const_value(scalar object or None): A compile-time constant value.
Actually, it is `None` iff self.is_input is `True`.
"""
# TODO(asi1024): Remove index argument.
def __init__(
self, index, serial_number, dtype, input_index=None, *,
const_value=None,):
super().__init__(
index, serial_number, dtype, (), (), input_index, None)
self.const_value = const_value
@property
def var_name(self):
if self.const_value is None:
return 'a{}'.format(self.memory.id)
if self.dtype == '?':
return str(self.const_value).lower()
if self.dtype.kind == 'c':
return '{}({}, {})'.format(
get_typename(self.dtype),
self.const_value.real,
self.const_value.imag)
return str(self.const_value)
@property
def lvar_name(self):
return 'v{}'.format(self.memory.id)
def as_interface(self):
return _fusion_interface._ScalarProxy(self)
def key(self):
return (self.memory.id,)
class _TraceArray(_TraceVariable):
"""An abstracted array object.
Attributes:
broadcasted_from(_TraceArray optional): TODO
rotated_from(_TraceArray optional): TODO
axis(int optional): The axis to rotate.
indexed_from(_TraceArray optional): TODO
index_key(slice): TODO
"""
def __init__(
self, index, serial_number, dtype, input_index=None,
output_index=None, *, rshape, ashape, **kwargs):
if ashape is None:
assert input_index is not None
ndim = len(rshape)
ashape = tuple([
_AbstractDim(input_index, axis) for axis in range(ndim)])
super().__init__(
index, serial_number, dtype, rshape, ashape,
input_index, output_index)
self._view_of = None
self.is_broadcast = False
self.rotate_axis = None
self.slice_key = None
if 'broadcasted_from' in kwargs:
self._view_of = kwargs.pop('broadcasted_from')
self.is_broadcast = True
elif 'rotated_from' in kwargs:
self._view_of = kwargs.pop('rotated_from')
self.rotate_axis = kwargs.pop('axis')
elif 'indexed_from' in kwargs:
self._view_of = kwargs.pop('indexed_from')
self.slice_key = kwargs.pop('index_key')
assert len(kwargs) == 0, kwargs
@property
def var_name(self):
return 'a{}_{}'.format(self.memory.id, self.serial_number)
@property
def lvar_name(self):
return 'v{}_{}'.format(self.memory.id, self.serial_number)
def as_interface(self):
return _fusion_interface._ArrayProxy(self)
def make_view(self, serial_number, **kwargs):
rshape = kwargs.pop('rshape', self.rshape)
ashape = kwargs.pop('ashape', self.ashape)
return _TraceArray(
self.memory, serial_number, self.dtype,
rshape=rshape, ashape=ashape, **kwargs)
def key(self):
"""Two variables can be identified if they have the same key.
"""
if isinstance(self.slice_key, tuple):
slice_key = []
for s in self.slice_key:
if isinstance(s, slice):
if not (s.start is None
and s.stop is None
and s.step in (None, 1, -1)):
raise NotImplementedError(
                            'Basic slicing supports only x[::] and x[::-1].')
slice_key.append((s.start, s.stop, s.step))
else:
slice_key.append(s)
slice_key = tuple(slice_key)
else:
slice_key = self.slice_key
return (
self.memory.id, self.ashape, self.input_index,
getattr(self._view_of, 'serial_number', None),
self.is_broadcast, self.rotate_axis, slice_key,
)
class _VariableSet:
"""A stable set of variables
"""
def __init__(self, *args):
self.contents = []
for x in args:
assert isinstance(x, _TraceVariable)
if x not in self.contents:
self.contents.append(x)
def __len__(self):
return len(self.contents)
def item(self):
assert len(self.contents) == 1
return self.contents[0]
def add(self, x):
if x not in self.contents:
self.contents.append(x)
def __iadd__(self, other):
assert isinstance(other, _VariableSet)
for x in other.contents:
self.add(x)
return self
def __add__(self, other):
res = _VariableSet(*self.contents)
res += other
return res
def __contains__(self, elem):
return elem in self.contents
def __iter__(self):
return iter(self.contents)
def __isub__(self, other):
assert isinstance(other, _VariableSet)
for x in other.contents:
if x in self.contents:
self.contents.remove(x)
return self
def __sub__(self, other):
res = _VariableSet(*self.contents)
res -= other
return res
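# Hedged sketch of _VariableSet semantics (a, b, c: _TraceVariable objects):
# insertion order is preserved, which keeps generated parameter order
# deterministic, unlike a plain set of these unhashable variables:
#
#     s = _VariableSet(a, b) + _VariableSet(b, c)
#     assert list(s) == [a, b, c]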
import re
import numpy
import cupy
import cupy._core._routines_manipulation as _manipulation
from cupy._core._dtype import get_dtype, _raise_if_invalid_cast
from cupy._core import internal
# Signature parsing code and dimension accessing has been borrowed
# from dask
# https://github.com/dask/dask/blob/61b578f5a3ad88cbc6a8b9a73ce08c551bd969fa/dask/array/gufunc.py#L12-L55
_DIMENSION_NAME = r'\w+\?*'
_CORE_DIMENSION_LIST = '(?:{0:}(?:,{0:})*,?)?'.format(_DIMENSION_NAME)
_ARGUMENT = r'\({}\)'.format(_CORE_DIMENSION_LIST)
_INPUT_ARGUMENTS = '(?:{0:}(?:,{0:})*,?)?'.format(_ARGUMENT)
_OUTPUT_ARGUMENTS = '{0:}(?:,{0:})*'.format(
_ARGUMENT
) # Use `'{0:}(?:,{0:})*,?'` if gufunc signatures
# should be allowed for length-1 tuple returns
_SIGNATURE = '^{0:}->{1:}$'.format(_INPUT_ARGUMENTS, _OUTPUT_ARGUMENTS)
def _parse_gufunc_signature(signature):
    # The code has been modified from dask to support optional dimensions
if not isinstance(signature, str):
raise TypeError('Signature is not a string')
if signature == '' or signature is None:
raise ValueError('Signature cannot be empty')
signature = signature.replace(' ', '')
if not re.match(_SIGNATURE, signature):
raise ValueError('Not a valid gufunc signature: {}'.format(signature))
in_txt, out_txt = signature.split('->')
ins = [tuple(x.split(',')) if x != '' else ()
for x in in_txt[1:-1].split('),(')]
outs = [tuple(y.split(',')) if y != '' else ()
for y in out_txt[1:-1].split('),(')]
# TODO(ecastill) multiple output support
if len(outs) > 1:
raise ValueError('Currently more than 1 output is not supported')
return ins, outs
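# Hedged usage sketch:
#
#     >>> _parse_gufunc_signature('(i,j),(j,k)->(i,k)')
#     ([('i', 'j'), ('j', 'k')], [('i', 'k')])
#     >>> _parse_gufunc_signature('(),()->()')
#     ([(), ()], [()])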
def _validate_normalize_axes(
axes, axis, keepdims, input_coredimss, output_coredimss
):
# This code credit goes to Dask
# https://github.com/dask/dask/blob/61b578f5a3ad88cbc6a8b9a73ce08c551bd969fa/dask/array/gufunc.py#L58-L172
nin = len(input_coredimss)
nout = (
1 if not isinstance(output_coredimss, list) else len(output_coredimss)
)
if axes is not None and axis is not None:
raise ValueError(
'Only one of `axis` or `axes` keyword arguments should be given')
if axes and not isinstance(axes, list):
raise ValueError('`axes` has to be of type list')
# output_coredimss = output_coredimss if nout > 1 else [output_coredimss]
filtered_core_dims = list(filter(len, input_coredimss))
nr_outputs_with_coredims = len(
[True for x in output_coredimss if len(x) > 0])
if keepdims:
if nr_outputs_with_coredims > 0:
raise ValueError('`keepdims` can only be used for scalar outputs')
output_coredimss = len(output_coredimss) * [filtered_core_dims[0]]
core_dims = input_coredimss + output_coredimss
if axis is not None:
if not isinstance(axis, int):
raise ValueError('`axis` argument has to be an integer value')
if filtered_core_dims:
cd0 = filtered_core_dims[0]
if len(cd0) != 1:
raise ValueError(
'`axis` can only be used if exactly one core'
' dimension is present'
)
for cd in filtered_core_dims:
if cd0 != cd:
raise ValueError(
'To use `axis`, all core dimensions have to be equal'
)
# Expand defaults or axis
if axes is None:
if axis is not None:
axes = [(axis,) if cd else tuple() for cd in core_dims]
else:
axes = [tuple(range(-len(icd), 0)) for icd in core_dims]
axes = [(a,) if isinstance(a, int) else a for a in axes]
if (
(nr_outputs_with_coredims == 0)
and (nin != len(axes))
and (nin + nout != len(axes))
) or ((nr_outputs_with_coredims > 0) and (nin + nout != len(axes))):
raise ValueError(
'The number of `axes` entries is not equal to the number'
' of input and output arguments')
# Treat outputs
output_axes = axes[nin:]
output_axes = (
output_axes
if output_axes
else [tuple(range(-len(ocd), 0)) for ocd in output_coredimss]
)
input_axes = axes[:nin]
# Assert we have as many `axes` entries as input core dimensions
for idx, (iax, icd) in enumerate(zip(input_axes, input_coredimss)):
if len(iax) != len(icd):
raise ValueError(
f'The number of `axes` entries for argument #{idx}'
' is not equal to the number of respective input core'
' dimensions in signature')
if not keepdims:
for idx, (oax, ocd) in enumerate(zip(output_axes, output_coredimss)):
if len(oax) != len(ocd):
raise ValueError(
f'The number of `axes` entries for argument #{idx}'
' is not equal to the number of respective output core'
' dimensions in signature')
else:
if input_coredimss:
icd0 = input_coredimss[0]
for icd in input_coredimss:
if icd0 != icd:
raise ValueError(
'To use `keepdims`, all core dimensions'
' have to be equal')
iax0 = input_axes[0]
output_axes = [iax0 for _ in output_coredimss]
return input_axes, output_axes
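# Illustrative example (not part of the original module): for the
# matmul-like signature '(i,j),(j,k)->(i,k)' with no explicit `axes`
# or `axis`, the defaults map every core dimension to the trailing
# axes of each operand:
#
#     >>> _validate_normalize_axes(
#     ...     None, None, False,
#     ...     [('i', 'j'), ('j', 'k')], [('i', 'k')])
#     ([(-2, -1), (-2, -1)], [(-2, -1)])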
class _OpsRegister:
'''
Holds the ops for each dtype signature, e.g. ``('ff->f', func1)``,
and allows look-ups on them
'''
class _Op:
def __init__(self, in_types, out_types, func):
self.func = func
self.in_types = tuple(numpy.dtype(i) for i in in_types)
self.out_types = tuple(numpy.dtype(o) for o in out_types)
self.sig_str = (''.join(
in_t.char for in_t in self.in_types) + '->' + ''.join(
out_t.char for out_t in self.out_types))
def __init__(self, signatures, default_func, nin, nout, name):
self._default_func = default_func
self._nin = nin
self._nout = nout
self._ops = self._process_signatures(signatures)
self._name = name
def _sig_str_to_tuple(self, sig):
sig = sig.replace(' ', '')
toks = sig.split('->')
if len(toks) != 2:
raise ValueError(f'signature {sig} for dtypes is invalid')
else:
ins, outs = toks
return ins, outs
def _process_signatures(self, signatures):
ops = []
for sig in signatures:
if isinstance(sig, tuple):
sig, op = sig
else:
op = self._default_func
ins, outs = self._sig_str_to_tuple(sig)
# Check the number of inputs and outputs matches the gufunc sig
if len(ins) != self._nin:
raise ValueError(
f'signature {sig} for dtypes is invalid: the number of'
' inputs is not consistent with the general signature')
if len(outs) != self._nout:
raise ValueError(
f'signature {sig} for dtypes is invalid: the number of'
' outputs is not consistent with the general signature')
ops.append(_OpsRegister._Op(ins, outs, op))
return ops
def _determine_from_args(self, args, casting):
n = len(args)
in_types = tuple(arg.dtype for arg in args)
for op in self._ops:
op_types = op.in_types
for i in range(n):
it = in_types[i]
ot = op_types[i]
if not numpy.can_cast(it, ot, casting=casting):
break
else:
return op
return None
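# Illustrative behavior (not part of the original module), assuming a
# register built as _OpsRegister(['ff->f', 'dd->d'], my_func, 2, 1, 'op'):
# with casting='safe', two float32 arguments select the 'ff->f' op,
# the first entry all inputs can be safely cast to; two float64
# arguments cannot be safely cast to float32, so they fall through to
# 'dd->d'.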
def _determine_from_dtype(self, dtype):
for op in self._ops:
op_types = op.out_types
for t in op_types:
if t != dtype:
break
else:
return op
return None
def _determine_from_signature(self, signature):
# Let's convert the signature, as it can be a tuple of dtypes
# or a string
if isinstance(signature, tuple):
# create a string to do a look-up on the ops
if len(signature) == 1:
raise TypeError(
'The use of a length 1 tuple for the ufunc `signature` is'
' not allowed. Use `dtype` or fill the tuple with'
' `None`s.')
nin = self._nin
nout = self._nout
if len(signature) != (nin + nout):
raise TypeError(
'A type-tuple must be specified of length'
f' {nin + nout} for ufunc {self._name}')
signature = ''.join(
numpy.dtype(t).char for t in signature[:nin]) + '->' + ''.join(
numpy.dtype(t).char for t in signature[nin:nin+nout])
if isinstance(signature, str):
is_out = len(signature) == 1
for op in self._ops:
if is_out:
for t in op.out_types:
if t.char != signature:
break
else:
return op
else:
if op.sig_str == signature:
return op
raise TypeError('No loop matching the specified signature and'
f' casting was found for ufunc {self._name}')
def determine_dtype(self, args, dtype, casting, signature):
ret_dtype = None
func = self._default_func
if signature is not None:
# TODO(ecastill) use an externally provided signature to
# find the typecasting rules
op = self._determine_from_signature(signature)
elif dtype is not None:
if isinstance(dtype, tuple):
# TODO(ecastill) support dtype tuples
raise RuntimeError('dtype with tuple is not yet supported')
op = self._determine_from_dtype(dtype)
else:
op = self._determine_from_args(args, casting)
if op is None:
# Should we allow op to be none?
if dtype is None:
# Promote the result dtype across all of the input dtypes
ret_dtype = args[0].dtype
for arg in args:
ret_dtype = numpy.promote_types(ret_dtype, arg.dtype)
else:
ret_dtype = get_dtype(dtype)
else:
# Convert args to the op specified in_types
n_args = []
def argname(): return f'ufunc {self._name} input {i}'
for i, (arg, in_type) in enumerate(zip(args, op.in_types)):
_raise_if_invalid_cast(arg.dtype, in_type, casting, argname)
n_args.append(arg.astype(in_type, copy=False))
args = n_args
ret_dtype = op.out_types[0]
func = op.func
return args, ret_dtype, func
class _GUFunc:
'''
Creates a Generalized Universal Function by wrapping a user
provided function with the signature.
``signature`` determines if the function consumes or produces core
dimensions. The remaining dimensions in given input arrays (``*args``)
are considered loop dimensions and are required to broadcast
naturally against each other.
Args:
func (callable):
Function to call like ``func(*args, **kwargs)`` on input arrays
(``*args``) that returns an array or tuple of arrays. If
multiple arguments with non-matching dimensions are supplied,
this function is expected to vectorize (broadcast) over axes of
positional arguments in the style of NumPy universal functions.
signature (string):
Specifies what core dimensions are consumed and produced by
``func``, following the NumPy generalized universal function
signature specification.
supports_batched (bool, optional):
If ``True``, the wrapped function accepts the complete input
arrays, including both the loop and the core dimensions.
Defaults to `False`, in which case the loop dimensions are
iterated over in the `GUFunc` processing code.
supports_out (bool, optional):
If the wrapped function supports out as one of its kwargs.
Defaults to `False`.
signatures (list of tuple of str):
Contains strings in the form of 'ii->i' with i being the char of a
dtype. Each element of the list is a tuple with the string
and an alternative function to `func` to be executed when the inputs
can be cast as described by this signature.
name (str, optional):
Name for the GUFunc object. If not specified, ``func``'s name
is used.
doc (str, optional):
Docstring for the GUFunc object. If not specified, ``func.__doc__``
is used.
'''
def __init__(self, func, signature, **kwargs):
# We would like to create gufuncs from cupy regular ufuncs
# so we can avoid most of the __call__ stuff
self._func = func
self._signature = signature
self.__name__ = kwargs.pop('name', func.__name__)
self.__doc__ = kwargs.pop('doc', func.__doc__)
# The following are attributes to avoid applying certain steps
# when wrapping cupy functions that do some of the gufunc
# stuff internally due to CUDA libraries requirements
self._supports_batched = kwargs.pop('supports_batched', False)
self._supports_out = kwargs.pop('supports_out', False)
signatures = kwargs.pop('signatures', [])
if kwargs:
raise TypeError(
'got unexpected keyword arguments: '
+ ', '.join([repr(k) for k in kwargs])
)
# Preprocess the signature here
input_coredimss, output_coredimss = _parse_gufunc_signature(
self._signature)
self._input_coredimss = input_coredimss
self._output_coredimss = output_coredimss
# This is pre-calculated to later check the minimum number of
# dimensions required per input
self._min_dims = [0] * len(input_coredimss)
for i, inp in enumerate(input_coredimss):
for d in inp:
if d[-1] != '?':
self._min_dims[i] += 1
# Determine nout: nout = 0 for functions returning a single
# array directly; otherwise the number of outputs in the signature
self._nout = (
0
if not isinstance(output_coredimss, list)
else len(output_coredimss)
)
self._nin = (
0
if not isinstance(input_coredimss, list)
else len(input_coredimss)
)
# Determines the function that will be run depending on the datatypes.
# `signatures` is a list whose elements are either a dtype string in
# the format 'ii->o' or a tuple of such a string and a function other
# than `func` to be executed for those types
self._ops_register = _OpsRegister(
signatures, self._func, self._nin, self._nout, self.__name__)
def _apply_func_to_inputs(self, func, dim, sizes, dims, args, outs):
# Apply function
# The resulting array is loop_output_dims+the specified dims
# Some functions have batching logic inside due to highly
# optimized CUDA libraries so we just call them
if self._supports_batched or dim == len(dims):
# Check if the function supports out, order and other args
if self._supports_out and outs is not None:
outs = outs[0] if len(outs) == 1 else outs
func(*args, out=outs)
else:
fouts = func(*args)
# TODO(ecastill) improve this check
if isinstance(fouts, cupy.ndarray):
fouts = (fouts,)
for o, fo in zip(outs, fouts):
cupy._core.elementwise_copy(fo, o)
else:
dim_size = sizes[dims[dim]][0]
for i in range(dim_size):
n_args = [a[i] for a in args]
n_outs = None
if outs is not None:
n_outs = [o[i] for o in outs]
self._apply_func_to_inputs(
func, dim + 1, sizes, dims, n_args, n_outs)
def _transpose_element(self, arg, iax, shape):
iax = tuple(a if a < 0 else a - len(shape) for a in iax)
tidc = (
tuple(i for i in range(-len(shape), 0) if i not in iax) + iax
)
return arg.transpose(tidc)
def _get_args_transposed(self, args, input_axes, outs, output_axes):
# This code credit goes to Dask
# https://github.com/dask/dask/blob/61b578f5a3ad88cbc6a8b9a73ce08c551bd969fa/dask/array/gufunc.py#L349-L377
# modifications have been done to support arguments broadcast
# out argument, and optional core dims.
transposed_args = []
# This is used when reshaping the outputs so that we can delete
# dims that were not specified in the input
missing_dims = set()
for i, (arg, iax, input_coredims, md) in enumerate(zip(
args, input_axes, self._input_coredimss, self._min_dims)):
shape = arg.shape
nds = len(shape)
# Inputs that have missing (optional) dimensions need to be reshaped
if nds < md:
raise ValueError(f'Input operand {i} does not have enough'
f' dimensions (has {nds}, gufunc core with'
f' signature {self._signature} requires {md})')
optionals = len(input_coredims) - nds
if optionals > 0:
# Look for optional dimensions
# We only allow the first or the last dimensions to be optional
if input_coredims[0][-1] == '?':
shape = (1,) * optionals + shape
missing_dims.update(set(input_coredims[:optionals]))
else:
shape = shape + (1,) * optionals
missing_dims.update(
set(input_coredims[len(input_coredims) - optionals:]))
arg = arg.reshape(shape)
transposed_args.append(self._transpose_element(arg, iax, shape))
args = transposed_args
if outs is not None:
transposed_outs = []
# outs should be transposed to the intermediate form before
# copying all results
for out, iox, coredims in zip(
outs, output_axes, self._output_coredimss):
transposed_outs.append(self._transpose_element(
out, iox, out.shape))
# check that outs has been correctly transposed
# if the function returns a scalar, outs will be ignored
if len(transposed_outs) == len(outs):
outs = transposed_outs
# We can't directly broadcast the arrays together since their core
# dims might differ; only the loop dimensions are broadcastable.
# The number of core dims is taken per argument from the signature.
shape = internal._broadcast_shapes(
[a.shape[:len(a.shape) - len(cd)]
for a, cd in zip(args, self._input_coredimss)])
args = [_manipulation.broadcast_to(
a, shape + a.shape[len(a.shape) - len(cd):])
for a, cd in zip(args, self._input_coredimss)]
# Assess input args for loop dims
input_shapes = [a.shape for a in args]
num_loopdims = [
len(s) - len(cd) for s, cd in zip(
input_shapes, self._input_coredimss)
]
max_loopdims = max(num_loopdims) if num_loopdims else None
core_input_shapes = [
dict(zip(icd, s[n:]))
for s, n, icd in zip(
input_shapes, num_loopdims, self._input_coredimss)
]
core_shapes = {}
for d in core_input_shapes:
core_shapes.update(d)
loop_input_dimss = [
tuple(
'__loopdim%d__' % d for d in range(
max_loopdims - n, max_loopdims)
)
for n in num_loopdims
]
input_dimss = [li + c for li, c in zip(
loop_input_dimss, self._input_coredimss)]
loop_output_dims = max(loop_input_dimss, key=len, default=())
# Assess input args for same size and chunk sizes
# Collect sizes and chunksizes of all dims in all arrays
dimsizess = {}
for dims, shape in zip(input_dimss, input_shapes):
for dim, size in zip(dims, shape):
dimsizes = dimsizess.get(dim, [])
dimsizes.append(size)
dimsizess[dim] = dimsizes
# Assert correct partitioning: all arrays must agree on the size of
# each dimension (size-1 entries broadcast)
for dim, sizes in dimsizess.items():
if set(sizes).union({1}) != {1, max(sizes)}:
raise ValueError(
f'Dimension {dim} has different lengths in arrays'
)
return args, dimsizess, loop_output_dims, outs, missing_dims
def _determine_order(self, args, order):
if order.upper() in ('C', 'K'):
# Order is determined to be C to allocate the out array
# but we will change the strides of the out array
# to be K later in __call__
return 'C'
elif order.upper() == 'A':
# order is F if all arrays are strictly F
order = ('F' if all([a.flags.f_contiguous
and not a.flags.c_contiguous
for a in args]) else 'C')
return order
elif order.upper() == 'F':
return 'F'
else:
raise RuntimeError(f'Unknown order {order}')
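# Illustrative behavior (not part of the original module): 'C' and 'K'
# both allocate the out array in C order here ('K' strides are fixed up
# later in __call__); 'A' returns 'F' only when every input is
# F-contiguous and not C-contiguous, so e.g. 1-D contiguous inputs
# (which are both) still yield 'C'.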
def __call__(self, *args, **kwargs):
'''
Apply a generalized ufunc.
Args:
args: Input arguments. Each of them can be a :class:`cupy.ndarray`
object or a scalar. The output arguments can be omitted or be
specified by the ``out`` argument.
axes (List of tuples of int, optional):
A list of tuples with indices of axes a generalized ufunc
should operate on.
For instance, for a signature of ``'(i,j),(j,k)->(i,k)'``
appropriate for matrix multiplication, the base elements are
two-dimensional matrices and these are taken to be stored in
the two last axes of each argument. The corresponding
axes keyword would be ``[(-2, -1), (-2, -1), (-2, -1)]``.
For simplicity, for generalized ufuncs that operate on
1-dimensional arrays (vectors), a single integer is accepted
instead of a single-element tuple, and for generalized ufuncs
for which all outputs are scalars, the output tuples
can be omitted.
axis (int, optional):
A single axis over which a generalized ufunc should operate.
This is a short-cut for ufuncs that operate over a single,
shared core dimension, equivalent to passing in axes with
entries of (axis,) for each single-core-dimension argument
and ``()`` for all others.
For instance, for a signature ``'(i),(i)->()'``, it is
equivalent to passing in ``axes=[(axis,), (axis,), ()]``.
keepdims (bool, optional):
If this is set to True, axes which are reduced over will be
left in the result as a dimension with size one, so that the
result will broadcast correctly against the inputs. This
option can only be used for generalized ufuncs that operate
on inputs that all have the same number of core dimensions
and with outputs that have no core dimensions, i.e., with
signatures like ``'(i),(i)->()'`` or ``'(m,m)->()'``.
If used, the location of the dimensions in the output can
be controlled with axes and axis.
casting (str, optional):
Provides a policy for what kind of casting is permitted.
Defaults to ``'same_kind'``
dtype (dtype, optional):
Overrides the dtype of the calculation and output arrays.
Similar to signature.
signature (str or tuple of dtype, optional):
Either a data-type, a tuple of data-types, or a special
signature string indicating the input and output types of a
ufunc. This argument allows you to provide a specific
signature for the function to be used if registered in the
``signatures`` kwarg of the ``__init__`` method.
If the loop specified does not exist for the ufunc, then
a TypeError is raised. Normally, a suitable loop is found
automatically by comparing the input types with what is
available and searching for a loop with data-types to
which all inputs can be cast safely. This keyword argument
lets you bypass that search and choose a particular loop.
order (str, optional):
Specifies the memory layout of the output array. Defaults to
``'K'``. ``'C'`` means the output should be C-contiguous,
``'F'`` means F-contiguous, ``'A'`` means F-contiguous
if the inputs are F-contiguous and not also C-contiguous,
C-contiguous otherwise, and ``'K'`` means to match the element
ordering of the inputs as closely as possible.
out (cupy.ndarray): Output array. If not specified, new
arrays are allocated.
Returns:
Output array or a tuple of output arrays.
'''
# This argument cannot be used for generalized ufuncs
# as those take non-scalar input.
# where = kwargs.pop('where', None)
outs = kwargs.pop('out', None)
axes = kwargs.pop('axes', None)
axis = kwargs.pop('axis', None)
order = kwargs.pop('order', 'K')
dtype = kwargs.pop('dtype', None)
keepdims = kwargs.pop('keepdims', False)
signature = kwargs.pop('signature', None)
casting = kwargs.pop('casting', 'same_kind')
if len(kwargs) > 0:
raise RuntimeError(
'Unknown kwargs {}'.format(' '.join(kwargs.keys())))
ret_dtype = None
func = self._func
# this will cast the inputs appropriately
args, ret_dtype, func = self._ops_register.determine_dtype(
args, dtype, casting, signature)
if not isinstance(self._signature, str):
raise TypeError('`signature` has to be of type string')
if outs is not None and type(outs) != tuple:
if isinstance(outs, cupy.ndarray):
outs = (outs,)
else:
raise TypeError('`outs` must be a tuple or `cupy.ndarray`')
filter_order = self._determine_order(args, order)
input_coredimss = self._input_coredimss
output_coredimss = self._output_coredimss
# Axes
input_axes, output_axes = _validate_normalize_axes(
axes, axis, keepdims, input_coredimss, output_coredimss
)
if len(input_coredimss) != len(args):
raise ValueError(
'According to `signature`, `func` requires %d arguments,'
' but %d given' % (len(input_coredimss), len(args)))
args, dimsizess, loop_output_dims, outs, m_dims = self._get_args_transposed( # NOQA
args, input_axes, outs, output_axes)
# The output shape varies depending on optional dims or not
# TODO(ecastill) this only works for one out argument
out_shape = [dimsizess[od][0] for od in loop_output_dims]
if self._nout > 0:
out_shape += [dimsizess[od][0] for od in output_coredimss[0]]
out_shape = tuple(out_shape)
if outs is None:
outs = cupy.empty(out_shape, dtype=ret_dtype, order=filter_order)
if order == 'K':
strides = internal._get_strides_for_order_K(
outs, ret_dtype, out_shape)
outs._set_shape_and_strides(out_shape, strides, True, True)
outs = (outs,)
else:
if outs[0].shape != out_shape:
raise ValueError(f'Invalid shape for out {outs[0].shape}'
f' needs {out_shape}')
_raise_if_invalid_cast(
ret_dtype, outs[0].dtype, casting, "out dtype")
self._apply_func_to_inputs(
func, 0, dimsizess, loop_output_dims, args, outs)
# This code credit goes to Dask
# https://github.com/dask/dask/blob/61b578f5a3ad88cbc6a8b9a73ce08c551bd969fa/dask/array/gufunc.py#L462-L503
# Treat direct output
if self._nout == 0:
output_coredimss = [output_coredimss]
# Split output
# tmp might be a tuple of outs
# we changed the way we apply the function compared to dask
# we have added support for optional dims
leaf_arrs = []
for tmp in outs:
for ocd, oax in zip(output_coredimss, output_axes):
leaf_arr = tmp
# Axes:
if keepdims:
slices = (len(leaf_arr.shape) * (slice(None),)
+ len(oax) * (numpy.newaxis,))
leaf_arr = leaf_arr[slices]
tidcs = [None] * len(leaf_arr.shape)
for i, oa in zip(range(-len(oax), 0), oax):
tidcs[oa] = i
j = 0
for i in range(len(tidcs)):
if tidcs[i] is None:
tidcs[i] = j
j += 1
leaf_arr = leaf_arr.transpose(tidcs)
# Delete the dims that were optionals after the input expansion
if len(m_dims) > 0:
shape = leaf_arr.shape
# This line deletes the dimensions that were not present
# in the input
core_shape = shape[-len(ocd):]
core_shape = tuple([
d for d, n in zip(core_shape, ocd) if n not in m_dims])
shape = shape[:-len(ocd)] + core_shape
leaf_arr = leaf_arr.reshape(shape)
# leaf_arrs.append(leaf_arr.astype(leaf_arr.dtype, order=order)) # NOQA
leaf_arrs.append(leaf_arr)
return tuple(leaf_arrs) if self._nout > 1 else leaf_arrs[0]
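# Illustrative usage sketch (not part of the original module).
# `cupy.matmul` is only a stand-in for a kernel that consumes the core
# dimensions; `supports_batched`/`supports_out` reflect that it already
# handles batching and an `out=` argument internally:
#
#     >>> gu_matmul = _GUFunc(
#     ...     cupy.matmul, '(i,j),(j,k)->(i,k)',
#     ...     supports_batched=True, supports_out=True)
#     >>> a = cupy.ones((4, 2, 3))
#     >>> b = cupy.ones((3, 5))
#     >>> gu_matmul(a, b).shape  # loop dim 4 broadcasts against b
#     (4, 2, 5)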
from libcpp cimport vector
from cupy._core cimport _carray
from cupy._core cimport _scalar
from cupy._core._carray cimport shape_t
from cupy._core.core cimport _ndarray_base
from cupy.cuda cimport memory
from cupy.cuda cimport texture
cdef class ParameterInfo:
cdef:
readonly str name
readonly object dtype
readonly str ctype
readonly bint raw
readonly bint is_const
cdef enum _ArgKind:
ARG_KIND_NDARRAY = 1
ARG_KIND_INDEXER
ARG_KIND_SCALAR
ARG_KIND_POINTER
ARG_KIND_TEXTURE
cdef class _ArgInfo:
# Holds metadata of an argument.
# This class is immutable and used as a part of hash keys.
cdef:
readonly _ArgKind arg_kind
readonly type type
readonly object dtype
readonly int ndim
readonly bint c_contiguous
readonly bint index_32_bits
cdef _ArgInfo _init(
self,
_ArgKind arg_kind,
type typ,
object dtype,
int ndim,
bint c_contiguous,
bint index_32_bits)
@staticmethod
cdef _ArgInfo from_arg(object arg)
@staticmethod
cdef _ArgInfo from_ndarray(_ndarray_base arg)
@staticmethod
cdef _ArgInfo from_scalar(_scalar.CScalar arg)
@staticmethod
cdef _ArgInfo from_indexer(_carray.Indexer arg)
@staticmethod
cdef _ArgInfo from_memptr(memory.MemoryPointer arg)
@staticmethod
cdef _ArgInfo from_texture(texture.TextureObject arg)
cdef _ArgInfo as_ndarray_with_ndim(self, int ndim)
cdef bint is_ndarray(self)
cdef bint is_scalar(self)
cdef str get_c_type(self)
cdef str get_param_c_type(self, ParameterInfo p)
cdef str get_c_var_name(self, ParameterInfo p)
cdef class _TypeMap:
# Typedef mapping between C types.
# This class is immutable.
cdef:
tuple _pairs
cdef str get_typedef_code(self)
cdef class _Op:
"""Simple data structure that represents a kernel routine with single \
concrete dtype mapping.
"""
cdef:
readonly tuple in_types
readonly tuple out_types
readonly int nin
readonly int nout
readonly object routine
# If the type combination specified by in_types and out_types is
# disallowed, error_func must be set instead of routine.
# It's called by check_valid() method.
readonly object error_func
@staticmethod
cdef _Op _from_type_and_routine_or_error_func(
str typ, object routine, object error_func)
# Creates an op instance parsing a dtype mapping.
@staticmethod
cdef _Op from_type_and_routine(str typ, routine)
cpdef tuple get_in_dtypes(self)
cpdef tuple get_out_dtypes(self)
# Creates an op instance parsing a dtype mapping with given error function.
@staticmethod
cdef _Op from_type_and_error_func(str typ, error_func)
# Raises an error if error_func is given.
cdef check_valid(self)
cdef class _Ops:
"""A kernel routine representation with various dtype mappings.
"""
cdef:
readonly tuple ops
readonly int nin
readonly int nout
@staticmethod
cdef _Ops from_tuples(object ops, routine)
# Queries a single op from input arguments.
cpdef _Op guess_routine(
self, str name, dict cache, list in_args, dtype, _Ops out_ops)
cpdef _Op _guess_routine_from_in_types(
self, tuple in_types, object can_cast=*)
cpdef _Op _guess_routine_from_dtype(self, object dtype)
cpdef create_ufunc(name, ops, routine=*, preamble=*, doc=*,
default_casting=*, loop_prep=*, out_ops=*,
cutensor_op=*, scatter_op=*)
cdef tuple _get_arginfos(list args)
cdef str _get_kernel_params(tuple params, tuple arginfos)
cdef list _broadcast(list args, tuple params, bint use_size, shape_t& shape)
cdef list _get_out_args_from_optionals(
subtype, list out_args, tuple out_types, const shape_t& out_shape, casting,
obj)
cdef list _get_out_args_with_params(
list out_args, tuple out_types,
const shape_t& out_shape, tuple out_params, bint is_size_specified)
cdef _check_peer_access(_ndarray_base arr, int device_id)
cdef list _preprocess_args(int dev_id, args, bint use_c_scalar)
cdef shape_t _reduce_dims(list args, tuple params, const shape_t& shape)
import string
import warnings
import numpy
import cupy
from cupy.cuda import compiler
from cupy import _util
cimport cython # NOQA
from libcpp cimport vector
from cupy.cuda cimport device
from cupy.cuda cimport function
from cupy.cuda cimport memory
from cupy.cuda cimport texture
from cupy._core cimport _accelerator
from cupy._core cimport _carray
from cupy._core cimport _scalar
from cupy._core._dtype cimport get_dtype, _raise_if_invalid_cast
from cupy._core._memory_range cimport may_share_bounds
from cupy._core._scalar import get_typename as _get_typename
from cupy._core cimport core
from cupy._core.core cimport _convert_object_with_cuda_array_interface
from cupy._core.core cimport _ndarray_init
from cupy._core.core cimport compile_with_cache
from cupy._core.core cimport _ndarray_base
from cupy._core cimport internal
from cupy_backends.cuda.api cimport runtime
try:
import cupy_backends.cuda.libs.cutensor as cuda_cutensor
except ImportError:
cuda_cutensor = None
from cupy._core import _fusion_thread_local
cdef inline bint _contains_zero(const shape_t& v) except? -1:
for i in range(v.size()):
if v[i] == 0:
return True
return False
@_util.memoize(for_each_device=True)
def _get_warpsize():
device_id = runtime.getDevice()
return runtime.getDeviceProperties(device_id)['warpSize']
cdef str _get_simple_elementwise_kernel_code(
tuple params, tuple arginfos, str operation, str name,
_TypeMap type_map, str preamble, str loop_prep='', str after_loop=''):
# Disable loop unrolling to avoid 64-bit division
module_code = string.Template('''
${typedef_preamble}
${preamble}
extern "C" __global__ void ${name}(${params}) {
${loop_prep};
#pragma unroll 1
CUPY_FOR(i, _ind.size()) {
_ind.set(i);
${operation};
}
${after_loop};
}
''').substitute(
typedef_preamble=type_map.get_typedef_code(),
params=_get_kernel_params(params, arginfos),
operation=operation,
name=name,
preamble=preamble,
loop_prep=loop_prep,
after_loop=after_loop)
return module_code
cdef function.Function _get_simple_elementwise_kernel_from_code(
str name, str code, tuple options=()):
module = compile_with_cache(code, options)
return module.get_function(name)
cdef function.Function _get_simple_elementwise_kernel(
tuple params, tuple arginfos, str operation, str name,
_TypeMap type_map, str preamble, str loop_prep='', str after_loop='',
tuple options=()):
code = _get_simple_elementwise_kernel_code(
params, arginfos, operation, name, type_map, preamble, loop_prep,
after_loop
)
return _get_simple_elementwise_kernel_from_code(name, code, options)
cdef inline int _get_kind_score(int kind):
if b'b' == kind:
return 0
if b'u' == kind or b'i' == kind:
return 1
if b'f' == kind or b'c' == kind:
return 2
return -1
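# Illustrative mapping (not part of the original module): the score
# orders dtype kinds for type-promotion decisions:
#     _get_kind_score(ord('b')) == 0   # bool
#     _get_kind_score(ord('i')) == 1   # signed/unsigned integers
#     _get_kind_score(ord('f')) == 2   # floats and complex
# and any other kind character yields -1.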
@cython.profile(False)
cdef inline _check_peer_access(_ndarray_base arr, int device_id):
if arr.data.device_id == device_id:
return
msg = (
f'The device where the array resides ({arr.data.device_id}) is '
f'different from the current device ({device_id}).'
)
cdef bint peer_access = device._enable_peer_access(
device_id, arr.data.device_id)
if not peer_access:
raise ValueError(
f'{msg} Peer access is unavailable between these devices.')
warnings.warn(
f'{msg} Peer access has been activated automatically.',
_util.PerformanceWarning)
cdef inline _preprocess_arg(int dev_id, arg, bint use_c_scalar):
if isinstance(arg, _ndarray_base):
s = arg
_check_peer_access(<_ndarray_base>s, dev_id)
elif isinstance(arg, texture.TextureObject):
s = arg
elif hasattr(arg, '__cuda_array_interface__'):
s = _convert_object_with_cuda_array_interface(arg)
_check_peer_access(<_ndarray_base>s, dev_id)
elif hasattr(arg, '__cupy_get_ndarray__'):
s = arg.__cupy_get_ndarray__()
_check_peer_access(<_ndarray_base>s, dev_id)
else: # scalars or invalid args
if use_c_scalar:
s = _scalar.scalar_to_c_scalar(arg)
else:
s = _scalar.scalar_to_numpy_scalar(arg)
if s is None:
raise TypeError('Unsupported type %s' % type(arg))
return s
cdef list _preprocess_args(int dev_id, args, bint use_c_scalar):
"""Preprocesses arguments for kernel invocation
- Checks device compatibility for ndarrays
- Converts Python/NumPy scalars:
- If use_c_scalar is True, into CScalars.
- If use_c_scalar is False, into NumPy scalars.
"""
cdef list ret = []
for arg in args:
ret.append(_preprocess_arg(dev_id, arg, use_c_scalar))
return ret
cdef list _preprocess_optional_args(int dev_id, args, bint use_c_scalar):
"""Preprocesses arguments for kernel invocation
- Checks device compatibility for ndarrays
- Converts Python/NumPy scalars:
- If use_c_scalar is True, into CScalars.
- If use_c_scalar is False, into NumPy scalars.
"""
cdef list ret = []
for arg in args:
if arg is None:
ret.append(None)
else:
ret.append(_preprocess_arg(dev_id, arg, use_c_scalar))
return ret
cdef class _ArgInfo:
# Holds metadata of an argument.
# This class is immutable and used as a part of hash keys.
def __init__(self, *args):
arg_kind, typ, dtype, ndim, c_contiguous, index_32_bits = args
self._init(arg_kind, typ, dtype, ndim, c_contiguous, index_32_bits)
cdef _ArgInfo _init(
self,
_ArgKind arg_kind,
type typ,
object dtype,
int ndim,
bint c_contiguous,
bint index_32_bits):
self.arg_kind = arg_kind
self.type = typ
self.dtype = dtype
self.ndim = ndim
self.c_contiguous = c_contiguous
self.index_32_bits = index_32_bits
@staticmethod
cdef _ArgInfo from_arg(object arg):
typ = type(arg)
if issubclass(typ, _ndarray_base):
return _ArgInfo.from_ndarray(arg)
if typ is _scalar.CScalar:
return _ArgInfo.from_scalar(arg)
if typ is _carray.Indexer:
return _ArgInfo.from_indexer(arg)
if typ is memory.MemoryPointer:
return _ArgInfo.from_memptr(arg)
if typ is texture.TextureObject:
return _ArgInfo.from_texture(arg)
assert False, typ
@staticmethod
cdef _ArgInfo from_ndarray(_ndarray_base arg):
cdef _ArgInfo ret = _ArgInfo.__new__(_ArgInfo)
ret._init(
ARG_KIND_NDARRAY,
type(arg),
arg.dtype.type,
arg._shape.size(),
arg._c_contiguous,
arg._index_32_bits)
return ret
@staticmethod
cdef _ArgInfo from_scalar(_scalar.CScalar arg):
cdef _ArgInfo ret = _ArgInfo.__new__(_ArgInfo)
dtype = arg.get_numpy_type()
ret._init(ARG_KIND_SCALAR, _scalar.CScalar, dtype, 0, True, True)
return ret
@staticmethod
cdef _ArgInfo from_indexer(_carray.Indexer arg):
cdef _ArgInfo ret = _ArgInfo.__new__(_ArgInfo)
ret._init(
ARG_KIND_INDEXER, _carray.Indexer, None, arg.ndim, True,
arg._index_32_bits)
return ret
@staticmethod
cdef _ArgInfo from_memptr(memory.MemoryPointer arg):
cdef _ArgInfo ret = _ArgInfo.__new__(_ArgInfo)
ret._init(
ARG_KIND_POINTER, memory.MemoryPointer, None, 0, True, True)
return ret
@staticmethod
cdef _ArgInfo from_texture(texture.TextureObject arg):
cdef _ArgInfo ret = _ArgInfo.__new__(_ArgInfo)
ret._init(
ARG_KIND_TEXTURE, texture.TextureObject, None, 0, True, True)
return ret
def __hash__(self):
return hash((self.arg_kind, self.type, self.dtype, self.ndim,
self.c_contiguous, self.index_32_bits))
def __eq__(self, other):
cdef _ArgInfo oth
if not isinstance(other, _ArgInfo):
return False
oth = other
return (
self.arg_kind == oth.arg_kind
and self.type is oth.type
and self.dtype == oth.dtype
and self.ndim == oth.ndim
and self.c_contiguous == oth.c_contiguous
and self.index_32_bits == oth.index_32_bits)
def __repr__(self):
return '<_ArgInfo({})>'.format(
' '.join([
'arg_kind={!r}'.format(self.arg_kind),
'type={!r}'.format(self.type),
'dtype={!r}'.format(self.dtype),
'ndim={!r}'.format(self.ndim),
'c_contiguous={!r}'.format(self.c_contiguous),
'index_32_bits={!r}'.format(self.index_32_bits),
]))
cdef _ArgInfo as_ndarray_with_ndim(self, int ndim):
# Returns an ndarray _ArgInfo with altered ndim.
# If ndim is the same, self is returned untouched.
assert self.arg_kind == ARG_KIND_NDARRAY
if self.ndim == ndim:
return self
return _ArgInfo(
ARG_KIND_NDARRAY, self.type, self.dtype, ndim, False, False)
cdef bint is_ndarray(self):
return self.arg_kind == ARG_KIND_NDARRAY
cdef bint is_scalar(self):
return self.arg_kind == ARG_KIND_SCALAR
cdef str get_c_type(self):
# Returns the C type representation.
if self.arg_kind == ARG_KIND_NDARRAY:
return 'CArray<%s, %d, %d, %d>' % (
_get_typename(self.dtype), self.ndim,
self.c_contiguous, self.index_32_bits)
if self.arg_kind == ARG_KIND_SCALAR:
return _get_typename(self.dtype)
if self.arg_kind == ARG_KIND_INDEXER:
return 'CIndexer<%d, %d>' % (self.ndim, self.index_32_bits)
if self.arg_kind == ARG_KIND_TEXTURE:
return 'cudaTextureObject_t'
assert False
cdef str get_param_c_type(self, ParameterInfo p):
# Returns the C type representation in the global function's
# parameter list.
cdef str ctyp = self.get_c_type()
if p.is_const:
return 'const ' + ctyp
return ctyp
cdef str get_c_var_name(self, ParameterInfo p):
if self.arg_kind in (ARG_KIND_NDARRAY, ARG_KIND_POINTER) and not p.raw:
return '_raw_' + p.name
return p.name
cdef tuple _get_arginfos(list args):
return tuple([_ArgInfo.from_arg(a) for a in args])
cdef str _get_kernel_params(tuple params, tuple arginfos):
cdef ParameterInfo p
cdef _ArgInfo arginfo
assert len(params) == len(arginfos)
lst = []
for i in range(len(params)):
p = params[i]
arginfo = arginfos[i]
lst.append('{} {}'.format(
arginfo.get_param_c_type(p),
arginfo.get_c_var_name(p)))
return ', '.join(lst)
cdef shape_t _reduce_dims(list args, tuple params, const shape_t& shape):
""" Remove contiguous stride to optimize CUDA kernel."""
cdef _ndarray_base arr
if shape.size() <= 1 or len(args) == 0:
return shape
if len(args) == 1: # fast path for reduction
a = args[0]
if (<ParameterInfo>params[0]).raw or not isinstance(a, _ndarray_base):
return shape
arr = a
arr = arr.reduced_view()
if arr is a:
return shape
else:
args[0] = arr
return arr._shape
return _reduced_view_core(args, params, shape)
cdef shape_t _reduced_view_core(list args, tuple params, const shape_t& shape):
cdef int i, ax, last_ax, ndim
cdef Py_ssize_t total_size
cdef shape_t vecshape, newshape, newstrides
cdef vector.vector[int] array_indexes, axes
cdef vector.vector[int] strides_indexes
cdef ParameterInfo p
cdef _ndarray_base arr
ndim = shape.size()
array_indexes.reserve(len(args))
strides_indexes.reserve(len(args))
for i in range(len(args)):
p = params[i]
if p.raw:
continue
a = args[i]
if isinstance(a, _ndarray_base):
array_indexes.push_back(i)
arr = a
if not arr._c_contiguous:
if ndim == 2: # short cut
return shape
strides_indexes.push_back(i)
if array_indexes.size() == 0:
return shape
if strides_indexes.size() == 0:
# The input arrays are all c_contiguous
i = array_indexes[0]
arr = args[i]
total_size = arr.size
newshape.assign(<Py_ssize_t>1, total_size)
newstrides.resize(1)
for i in array_indexes:
arr = args[i]
newstrides[0] = arr.dtype.itemsize
# TODO(niboshi): Confirm update_x_contiguity flags
args[i] = arr._view(
type(arr), newshape, newstrides, False, True, arr)
return newshape
axes.reserve(ndim)
vecshape.reserve(ndim)
for ax in range(ndim):
vecshape.push_back(shape[ax])
last_ax = -1
for ax in range(ndim):
if vecshape[ax] == 1:
continue
if last_ax < 0:
last_ax = ax
continue
for i in strides_indexes:
arr = args[i]
if arr._strides[ax] * vecshape[ax] != arr._strides[last_ax]:
axes.push_back(last_ax)
break
else:
vecshape[ax] *= vecshape[last_ax]
last_ax = ax
if last_ax >= 0:
axes.push_back(last_ax)
if <int>axes.size() == ndim:
return shape
newshape.reserve(axes.size())
newstrides.reserve(axes.size())
for ax in axes:
newshape.push_back(vecshape[ax])
for i in array_indexes:
arr = args[i]
newstrides.clear()
for ax in axes:
newstrides.push_back(arr._strides[ax])
# TODO(niboshi): Confirm update_x_contiguity flags
args[i] = arr._view(type(arr), newshape, newstrides, False, True, arr)
return newshape
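# Illustrative effect (not part of the original module, strides assumed
# for a C-contiguous float32 input): a single (4, 5) array bound to a
# non-raw parameter is collapsed by _reduce_dims into a (20,) view, so
# the kernel indexes a flat array instead of computing 2-D offsets.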
cdef class ParameterInfo:
def __init__(self, str param, bint is_const):
self.name = None
self.dtype = None
self.ctype = None
self.raw = False
self.is_const = is_const
s = tuple([i for i in param.split() if len(i) != 0])
if len(s) < 2:
raise Exception('Syntax error: %s' % param)
t, self.name = s[-2:]
if t == 'CIndexer':
pass
elif len(t) == 1:
self.ctype = t
else:
dtype = get_dtype(t)
self.dtype = dtype.type
if dtype.name != t:
raise ValueError('Wrong type %s' % t)
self.ctype = _get_typename(self.dtype)
for i in s[:-2]:
if i == 'raw':
self.raw = True
elif i == '_non_const':
self.is_const = False
else:
raise Exception('Unknown keyword "%s"' % i)
def __hash__(self):
return hash((
self.name, self.dtype, self.ctype, self.raw, self.is_const))
def __eq__(self, other):
cdef ParameterInfo oth
if not isinstance(other, ParameterInfo):
return False
oth = other
return (
self.name == oth.name
and self.dtype == oth.dtype
and self.ctype == oth.ctype
and self.raw == oth.raw
and self.is_const == oth.is_const)
def __repr__(self):
return '<ParameterInfo({})>'.format(
' '.join([
'name={!r}'.format(self.name),
'dtype={!r}'.format(self.dtype),
'ctype={!r}'.format(self.ctype),
'raw={!r}'.format(self.raw),
'is_const={!r}'.format(self.is_const),
]))
@_util.memoize()
def _get_param_info(str s, is_const):
if len(s) == 0:
return ()
return tuple([ParameterInfo(i, is_const) for i in s.strip().split(',')])
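# Illustrative example (not part of the original module):
# _get_param_info('float32 x, raw T y', True) yields two ParameterInfo
# objects: 'x' with dtype numpy.float32 and ctype 'float', and the raw
# parameter 'y' with the type placeholder 'T'.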
@_util.memoize()
def _decide_params_type(in_params, out_params, in_args_dtype, out_args_dtype):
return _decide_params_type_core(in_params, out_params, in_args_dtype,
out_args_dtype)
cdef class _TypeMap:
def __init__(self, pairs):
self._pairs = pairs
def __hash__(self):
return hash(self._pairs)
def __eq__(self, other):
if not isinstance(other, _TypeMap):
return False
return self._pairs == (<_TypeMap>other)._pairs
def __str__(self):
return '<_TypeMap {}>'.format(self._pairs)
cdef str get_typedef_code(self):
# Returns a code fragment of typedef statements used as preamble.
return ''.join([
'typedef %s %s;\n' % (_get_typename(ctype2), ctype1)
for ctype1, ctype2 in self._pairs])
cdef tuple _decide_params_type_core(
tuple in_params, tuple out_params, tuple in_args_dtype,
tuple out_args_dtype):
type_dict = {}
if out_args_dtype:
assert len(out_params) == len(out_args_dtype)
for p, a in zip(out_params, out_args_dtype):
if a is None:
raise TypeError('Output arguments must be cupy.ndarray')
if p.dtype is not None:
if get_dtype(a) != get_dtype(p.dtype):
raise TypeError(
'Type is mismatched. %s %s %s' % (p.name, a, p.dtype))
elif p.ctype in type_dict:
t = type_dict[p.ctype]
if get_dtype(t) != get_dtype(a):
raise TypeError(
'Type is mismatched. %s %s %s %s' % (
p.name, a, t, p.ctype))
else:
type_dict[p.ctype] = a
assert len(in_params) == len(in_args_dtype)
unknown_ctype = [] # TODO(leofang): remove this as it's unused?
for p, a in zip(in_params, in_args_dtype):
if a is None:
if p.dtype is None:
unknown_ctype.append(p.ctype)
else:
if p.dtype is not None:
if numpy.dtype(a) != numpy.dtype(p.dtype):
raise TypeError(
'Type is mismatched. %s %s %s' % (p.name, a, p.dtype))
elif p.ctype in type_dict:
t = type_dict[p.ctype]
if numpy.dtype(t) != numpy.dtype(a):
raise TypeError(
'Type is mismatched. %s %s %s %s' % (
p.name, a, t, p.ctype))
else:
type_dict[p.ctype] = a
in_types = tuple([type_dict[p.ctype] if p.dtype is None else p.dtype
for p in in_params])
out_types = tuple([type_dict[p.ctype] if p.dtype is None else p.dtype
for p in out_params])
type_map = _TypeMap(tuple(sorted(type_dict.items())))
return in_types, out_types, type_map
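# Illustrative example (not part of the original module): for in_params
# parsed from 'T x, T y' and two float32 input dtypes, the placeholder
# 'T' is bound once, so in_types and out_types all resolve to float32
# and the resulting _TypeMap emits a single 'typedef float T;' line;
# passing float32 and float64 for the same 'T' raises the
# 'Type is mismatched' TypeError instead.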
cdef list _broadcast(list args, tuple params, bint use_size, shape_t& shape):
# `shape` is an output argument
cdef Py_ssize_t i
cdef ParameterInfo p
cdef bint any_nonraw_array = False
# Collect non-raw arrays
value = []
for i, a in enumerate(args):
p = params[i]
if not p.raw and isinstance(a, _ndarray_base):
# Non-raw array
any_nonraw_array = True
value.append(a)
else:
value.append(None)
if use_size:
if any_nonraw_array:
raise ValueError('Specified \'size\' can be used only '
'if all of the ndarrays are \'raw\'.')
else:
if not any_nonraw_array:
raise ValueError('Loop size is undecided.')
# Perform broadcast.
# Note that arrays in `value` are replaced with broadcasted ones.
internal._broadcast_core(value, shape)
# Restore raw arrays and scalars from the original list.
for i, a in enumerate(value):
if a is None:
value[i] = args[i]
return value
cdef _numpy_can_cast = numpy.can_cast
cdef list _get_out_args_from_optionals(
subtype, list out_args, tuple out_types, const shape_t& out_shape, casting,
obj
):
cdef _ndarray_base arr
while len(out_args) < len(out_types):
out_args.append(None)
for i, a in enumerate(out_args):
if a is None:
out_args[i] = _ndarray_init(
subtype, out_shape, out_types[i], obj)
continue
if not isinstance(a, _ndarray_base):
raise TypeError(
'Output arguments type must be cupy.ndarray')
arr = a
if not internal.vector_equal(arr._shape, out_shape):
raise ValueError('Out shape is mismatched')
out_type = get_dtype(out_types[i])
_raise_if_invalid_cast(out_type, arr.dtype, casting, "output operand")
return out_args
cdef _copy_in_args_if_needed(list in_args, list out_args):
# `in_args` is an input and output argument
cdef _ndarray_base inp, out
for i in range(len(in_args)):
a = in_args[i]
if isinstance(a, _ndarray_base):
inp = a
for out in out_args:
if inp is not out and may_share_bounds(inp, out):
in_args[i] = inp.copy()
break
cdef list _get_out_args_with_params(
list out_args, tuple out_types, const shape_t& out_shape,
tuple out_params, bint is_size_specified):
cdef ParameterInfo p
cdef _ndarray_base arr
if not out_args:
for p in out_params:
if p.raw and not is_size_specified:
raise ValueError('Output array size is undecided')
return [_ndarray_init(
cupy.ndarray, out_shape, t, None) for t in out_types]
for i, p in enumerate(out_params):
a = out_args[i]
if not isinstance(a, _ndarray_base):
raise TypeError(
'Output arguments type must be cupy.ndarray')
arr = a
if not p.raw and not internal.vector_equal(arr._shape, out_shape):
raise ValueError('Out shape is mismatched')
return out_args
@_util.memoize()
def _get_elementwise_kernel_code(
tuple arginfos, _TypeMap type_map,
tuple params, str operation, str name,
str preamble, str loop_prep='', str after_loop='', tuple options=()):
cdef _ArgInfo arginfo
op = []
for p, arginfo in zip(params, arginfos):
if arginfo.is_ndarray() and not p.raw:
if p.is_const:
fmt = 'const {t} &{n} = _raw_{n}[_ind.get()];'
else:
fmt = '{t} &{n} = _raw_{n}[_ind.get()];'
op.append(fmt.format(t=p.ctype, n=p.name))
op.append(operation)
operation = '\n'.join(op)
return _get_simple_elementwise_kernel_code(
params, arginfos, operation, name, type_map,
preamble, loop_prep, after_loop)
@_util.memoize(for_each_device=True)
def _get_elementwise_kernel(
tuple arginfos, _TypeMap type_map,
tuple params, str operation, str name,
str preamble, str loop_prep='', str after_loop='', tuple options=()):
cdef str code = _get_elementwise_kernel_code(
arginfos, type_map, params, operation, name, preamble, loop_prep,
after_loop
)
return _get_simple_elementwise_kernel_from_code(name, code, options)
cdef class ElementwiseKernel:
"""User-defined elementwise kernel.
This class can be used to define an elementwise kernel with or without
broadcasting.
The kernel is compiled at an invocation of the
:meth:`~ElementwiseKernel.__call__` method,
which is cached for each device.
The compiled binary is also cached into a file under the
``$HOME/.cupy/kernel_cache/`` directory with a hashed file name. The cached
binary is reused by other processes.
Args:
in_params (str): Input argument list.
out_params (str): Output argument list.
operation (str): The body in the loop written in CUDA-C/C++.
name (str): Name of the kernel function. It should be set for
readability of the performance profiling.
reduce_dims (bool): If ``False``, the shapes of array arguments are
kept within the kernel invocation. The shapes are reduced
(i.e., the arrays are reshaped without copy to the minimum
dimension) by default. It may make the kernel faster by reducing the
index calculations.
options (tuple): Compile options passed to NVRTC. For details, see
https://docs.nvidia.com/cuda/nvrtc/index.html#group__options.
preamble (str): Fragment of the CUDA-C/C++ code that is inserted at the
top of the cu file.
no_return (bool): If ``True``, __call__ returns ``None``.
return_tuple (bool): If ``True``, __call__ always returns tuple of
array even if single value is returned.
loop_prep (str): Fragment of the CUDA-C/C++ code that is inserted at
the top of the kernel function definition and above the ``for``
loop.
after_loop (str): Fragment of the CUDA-C/C++ code that is inserted at
the bottom of the kernel function definition.
"""
cdef:
readonly tuple in_params
readonly tuple out_params
readonly Py_ssize_t nin
readonly Py_ssize_t nout
readonly Py_ssize_t nargs
readonly tuple params
readonly object operation
readonly str name
readonly str __name__
readonly bint reduce_dims
readonly object preamble
readonly bint no_return
readonly bint return_tuple
readonly dict kwargs
readonly dict _params_type_memo
readonly dict _elementwise_kernel_memo
readonly dict _cached_codes
def __init__(self, in_params, out_params, operation,
name='kernel', reduce_dims=True, preamble='',
no_return=False, return_tuple=False, **kwargs):
if not compiler.is_valid_kernel_name(name):
raise ValueError(
'Invalid kernel name: "%s"' % name)
self.in_params = _get_param_info(in_params, True)
self.out_params = _get_param_info(out_params, False)
self.nin = len(self.in_params)
self.nout = len(self.out_params)
self.nargs = self.nin + self.nout
param_rest = _get_param_info('CIndexer _ind', False)
self.params = self.in_params + self.out_params + param_rest
self.operation = operation
self.name = name
self.reduce_dims = reduce_dims
self.preamble = preamble
self.no_return = no_return
self.return_tuple = return_tuple
self.kwargs = kwargs
self._params_type_memo = {}
self._cached_codes = {}
names = [p.name for p in self.in_params + self.out_params]
if 'i' in names:
raise ValueError('Can not use \'i\' as a parameter name')
self._elementwise_kernel_memo = {}
# This is for profiling mechanisms to auto infer a name
self.__name__ = name
def __call__(self, *args, **kwargs):
"""Compiles and invokes the elementwise kernel.
The compilation runs only if the kernel is not cached. Note that
kernels with different argument dtypes or dimensions are not
compatible. It means that a single ElementwiseKernel object may be
compiled into multiple kernel binaries.
Args:
args: Arguments of the kernel.
size (int): Range size of the indices. By default, the range size
is automatically determined from the result of broadcasting.
This parameter must be specified if and only if all ndarrays
are `raw` and the range size cannot be determined
automatically.
block_size (int): Number of threads per block. By default, the
value is set to 128.
Returns:
If ``no_return`` has not been set, arrays are returned according to
the ``out_params`` argument of the ``__init__`` method.
If ``no_return`` has been set, ``None`` is returned.
"""
cdef function.Function kern
cdef Py_ssize_t size, i
cdef list in_args, out_args
cdef tuple in_types, out_types
cdef shape_t shape
size = kwargs.pop('size', -1)
stream = kwargs.pop('stream', None)
block_size = kwargs.pop('block_size', 128)
if len(kwargs):
raise TypeError('Wrong arguments %s' % kwargs)
if block_size <= 0:
raise ValueError('block_size must be greater than zero')
n_args = len(args)
if n_args != self.nin and n_args != self.nargs:
raise TypeError(
'Wrong number of arguments for {!r}. '
'It must be either {} or {} (with outputs), '
'but given {}.'.format(
self.name, self.nin, self.nargs, n_args))
for arg in args:
if hasattr(arg, '__cupy_override_elementwise_kernel__'):
return arg.__cupy_override_elementwise_kernel__(
self, *args, **kwargs)
dev_id = device.get_device_id()
arg_list = _preprocess_args(dev_id, args, True)
out_args = arg_list[self.nin:]
# _broadcast updates shape
in_args = _broadcast(
arg_list, self.params, size != -1, shape)[:self.nin]
in_ndarray_types = []
for a in in_args:
if isinstance(a, _ndarray_base):
t = a.dtype.type
elif isinstance(a, texture.TextureObject):
t = 'cudaTextureObject_t'
else:
t = None
in_ndarray_types.append(t)
in_ndarray_types = tuple(in_ndarray_types)
out_ndarray_types = tuple([a.dtype.type for a in out_args])
in_types, out_types, type_map = self._decide_params_type(
in_ndarray_types, out_ndarray_types)
is_size_specified = False
if size != -1:
shape.assign(1, size)
is_size_specified = True
out_args = _get_out_args_with_params(
out_args, out_types, shape, self.out_params, is_size_specified)
if self.no_return:
ret = None
elif not self.return_tuple and self.nout == 1:
ret = out_args[0]
else:
ret = tuple(out_args)
if _contains_zero(shape):
return ret
for i, x in enumerate(in_args):
if type(x) is _scalar.CScalar:
(<_scalar.CScalar>x).apply_dtype(in_types[i])
inout_args = in_args + out_args
if self.reduce_dims:
shape = _reduce_dims(inout_args, self.params, shape)
indexer = _carray._indexer_init(shape)
inout_args.append(indexer)
arginfos = _get_arginfos(inout_args)
kern = self._get_elementwise_kernel(dev_id, arginfos, type_map)
kern.linear_launch(indexer.size, inout_args, shared_mem=0,
block_max_size=block_size, stream=stream)
return ret
cpdef tuple _decide_params_type(
self, tuple in_args_dtype, tuple out_args_dtype):
key = (in_args_dtype, out_args_dtype)
ret = self._params_type_memo.get(key, None)
if ret is not None:
return ret
ret = _decide_params_type_core(
self.in_params, self.out_params, in_args_dtype, out_args_dtype)
self._params_type_memo[key] = ret
return ret
cpdef function.Function _get_elementwise_kernel(
self, int dev_id, tuple arginfos, _TypeMap type_map):
key = (
dev_id,
arginfos,
type_map)
kern = self._elementwise_kernel_memo.get(key, None)
if kern is not None:
return kern
kern = _get_elementwise_kernel(
arginfos, type_map, self.params, self.operation,
self.name, self.preamble, **self.kwargs)
# Store the compiled kernel in the cache.
# Potentially overwrite a duplicate cache entry because
# _get_elementwise_kernel() may include IO wait.
in_types = []
for x in arginfos:
if x.type is cupy.ndarray:
in_types.append(cupy.dtype(x.dtype).char)
in_types = tuple(in_types)
if in_types not in self._cached_codes:
code = _get_elementwise_kernel_code(
arginfos, type_map, self.params, self.operation,
self.name, self.preamble, **self.kwargs)
self._cached_codes[in_types] = code
self._elementwise_kernel_memo[key] = kern
return kern
@property
def cached_codes(self):
"""Returns a dict that has input types as keys and codes values.
This proprety method is for debugging purpose.
The return value is not guaranteed to keep backward compatibility.
"""
if len(self._cached_codes) == 0:
warnings.warn(
'No codes are cached because compilation is deferred until '
'the first function call.')
return dict([(k, v) for k, v in self._cached_codes.items()])
@property
def cached_code(self):
"""Returns `next(iter(self.cached_codes.values()))`.
This property method is for debugging purposes.
The return value is not guaranteed to keep backward compatibility.
"""
codes = self._cached_codes
if len(codes) > 1:
warnings.warn(
'The input types of the kernel could not be inferred. '
'Please use `.cached_codes` instead.')
return next(iter(codes.values()))
cdef str fix_cast_expr(src_type, dst_type, str expr):
src_kind = get_dtype(src_type).kind
dst_kind = get_dtype(dst_type).kind
if src_kind == dst_kind:
return expr
if src_kind == 'b':
# HIP has an issue with bool conversions detailed below
if runtime._is_hip_environment:
return f'_hip_bool_cast({expr})'
else:
return f'({expr}) ? 1 : 0'
if src_kind == 'c':
if dst_kind == 'b':
return f'({expr}) != {_scalar.get_typename(src_type)}()'
else: # dst_kind in 'iuf' (int, uint, float)
return f'({expr}).real()'
return expr
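# Illustrative expansions (not part of the original module), on a CUDA
# (non-HIP) build:
#     fix_cast_expr(numpy.bool_, numpy.int32, 'in0[i]')
#         -> '(in0[i]) ? 1 : 0'
#     fix_cast_expr(numpy.complex64, numpy.float32, 'in0[i]')
#         -> '(in0[i]).real()'
#     fix_cast_expr(numpy.float32, numpy.float64, 'in0[i]')
#         -> 'in0[i]'  (same kind, no rewrite)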
cdef function.Function _get_ufunc_kernel(
tuple in_types, tuple out_types, routine, tuple arginfos,
bint has_where, params,
name, preamble, loop_prep):
cdef _ArgInfo arginfo
cdef str str_type, str_var
offset_where = len(in_types)
offset_out = offset_where
if has_where:
offset_out += 1
types = []
op = []
if has_where:
arginfo = arginfos[offset_where]
if arginfo.is_ndarray():
op.append('if(!_raw__where[_ind.get()]) continue;')
else:
op.append('if(!_where) continue;')
for i, x in enumerate(in_types):
str_var = 'in%d' % i
str_type = str_var + '_type'
types.append((str_type, x))
arginfo = arginfos[i]
if arginfo.is_ndarray():
op.append('const {} {}({});'.format(
str_type,
str_var,
fix_cast_expr(arginfo.dtype, x, f'_raw_{str_var}[_ind.get()]')
))
out_op = []
for i, x in enumerate(out_types):
str_var = 'out%d' % i
str_type = str_var + '_type'
types.append((str_type, x))
arginfo = arginfos[i + offset_out]
op.append(f'{str_type} {str_var};')
out_op.append('{} = {};'.format(
f'_raw_{str_var}[_ind.get()]',
fix_cast_expr(x, arginfo.dtype, str_var)
))
type_map = _TypeMap(tuple(types))
op.append(routine)
op.append(';')
op.extend(out_op)
operation = '\n'.join(op)
# HIP/ROCm 4.3 has an issue with ifs and ternary operators
#
# int bool(int x) {
# if (x != 0) return 1;
# return 0;
# }
#
# bool(5) == 1; //false
# bool(5) == 5; //true
#
# also it simplifies (a ? 1 : 0) directly to a, and yields
# an incorrect value
if runtime._is_hip_environment:
preamble += """
__device__ int _hip_bool_cast(long long int x) {
volatile int a = 1;
if (x == 0) a = 0;
return a;
}
"""
return _get_simple_elementwise_kernel(
params, arginfos, operation, name, type_map, preamble,
loop_prep=loop_prep)
cdef inline bint _check_should_use_min_scalar(list in_args) except? -1:
cdef int kind, max_array_kind, max_scalar_kind
cdef bint all_scalars
all_scalars = True
max_array_kind = -1
max_scalar_kind = -1
for i in in_args:
kind = _get_kind_score(ord(i.dtype.kind))
if isinstance(i, _ndarray_base):
all_scalars = False
max_array_kind = max(max_array_kind, kind)
else:
max_scalar_kind = max(max_scalar_kind, kind)
return (max_scalar_kind != -1 and
not all_scalars and
max_array_kind >= max_scalar_kind)
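# Illustrative behavior (not part of the original module): mixing a
# float32 ndarray with a Python float scalar gives
# max_array_kind == max_scalar_kind == 2, so this returns True and the
# scalar's minimal dtype is used, keeping the output float32. With only
# scalars (all_scalars stays True), or with an integer array and a
# float scalar (1 < 2), it returns False.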
cdef dict _mst_unsigned_to_signed = {
i: (numpy.iinfo(j).max, (i, j))
for i, j in [(numpy.dtype(i).type, numpy.dtype(i.lower()).type)
for i in "BHILQ"]}
cdef _numpy_min_scalar_type = numpy.min_scalar_type
cdef _min_scalar_type(x):
# A non-negative integer may have two locally minimum scalar
# types: signed/unsigned integer.
# Return both for can_cast, while numpy.min_scalar_type only returns
# the unsigned type.
t = _numpy_min_scalar_type(x)
dt = t.type
if t.kind == 'u':
m, dt2 = <tuple>_mst_unsigned_to_signed[dt]
if x <= m:
return dt2
return dt
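# Illustrative results (not part of the original module):
#     _min_scalar_type(5)    -> (numpy.uint8, numpy.int8)  # fits both
#     _min_scalar_type(200)  -> numpy.uint8                # > int8 max
#     _min_scalar_type(-5)   -> numpy.int8                 # signed kind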
cdef class ufunc:
"""Universal function.
Attributes:
~ufunc.name (str): The name of the universal function.
~ufunc.nin (int): Number of input arguments.
~ufunc.nout (int): Number of output arguments.
~ufunc.nargs (int): Number of all arguments.
"""
cdef:
readonly Py_ssize_t nin
readonly Py_ssize_t nout
readonly Py_ssize_t nargs
readonly object name
readonly _Ops _ops # normal routines
# routines based on explicitly given output dtype
readonly _Ops _out_ops
readonly object _preamble
readonly object _loop_prep
readonly object _default_casting
readonly object _cutensor_op
readonly int _cutensor_alpha
readonly int _cutensor_gamma
readonly str _scatter_op
readonly tuple _params
readonly tuple _params_with_where
readonly dict _routine_cache
readonly dict _kernel_memo
readonly object __doc__
readonly object __name__
readonly object __module__
def __init__(
self, name, nin, nout, _Ops ops, preamble='', loop_prep='', doc='',
default_casting=None, *, _Ops out_ops=None, cutensor_op=None,
scatter_op=None):
self.name = name
self.__name__ = name
self.nin = nin
self.nout = nout
self.nargs = nin + nout
self._ops = ops
self._out_ops = out_ops
self._preamble = preamble
self._loop_prep = loop_prep
self.__doc__ = doc
if default_casting is None:
self._default_casting = 'same_kind'
else:
self._default_casting = default_casting
if cutensor_op is not None and cuda_cutensor is not None:
self._cutensor_op, self._cutensor_alpha, self._cutensor_gamma = (
getattr(cuda_cutensor, cutensor_op[0]),
cutensor_op[1], cutensor_op[2])
self._scatter_op = scatter_op
_in_params = tuple(
ParameterInfo('T in%d' % i, True)
for i in range(nin))
_out_params = tuple(
ParameterInfo('T out%d' % i, False)
for i in range(nout))
_other_params = (
ParameterInfo('CIndexer _ind', False),)
self._params = _in_params + _out_params + _other_params
self._params_with_where = (
_in_params + (ParameterInfo('T _where', False),)
+ _out_params + _other_params)
self._routine_cache = {}
self._kernel_memo = {}
def __repr__(self):
return '<ufunc \'%s\'>' % self.name
@property
def types(self):
"""A list of type signatures.
Each type signature is represented by type character codes of inputs
and outputs separated by '->'.
"""
types = []
for op in self._ops.ops:
in_str = ''.join([<str>get_dtype(t).char for t in op.in_types])
out_str = ''.join([<str>get_dtype(t).char for t in op.out_types])
types.append('%s->%s' % (in_str, out_str))
return types
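# Usage sketch (illustrative, not part of the original source): for a
# binary ufunc such as cupy.add this property yields signatures like
#
#     cupy.add.types  # e.g. [..., 'bb->b', ..., 'ff->f', 'dd->d', ...]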
def __call__(self, *args, **kwargs):
"""Applies the universal function to arguments elementwise.
Args:
args: Input arguments. Each of them can be a :class:`cupy.ndarray`
object or a scalar. The output arguments can be omitted or be
specified by the ``out`` argument.
out (cupy.ndarray): Output array. It outputs to new arrays
by default.
dtype: Data type specifier.
Returns:
Output array or a tuple of output arrays.
"""
for arg in args:
if hasattr(arg, '__cupy_override_elementwise_kernel__'):
return arg.__cupy_override_elementwise_kernel__(
self, *args, **kwargs)
if _fusion_thread_local.is_fusing():
return _fusion_thread_local.call_ufunc(self, *args, **kwargs)
cdef function.Function kern
cdef list broad_values
cdef shape_t shape
out = kwargs.pop('out', None)
where = kwargs.pop('_where', None)
cdef bint has_where = where is not None
dtype = kwargs.pop('dtype', None)
# Note default behavior of casting is 'same_kind' on numpy>=1.10
casting = kwargs.pop('casting', self._default_casting)
if dtype is not None:
dtype = get_dtype(dtype).type
if kwargs:
raise TypeError('Wrong arguments %s' % kwargs)
n_args = len(args)
if not (self.nin <= n_args <= self.nargs):
# TODO(kataoka): Fix error message for nout >= 2 (e.g. divmod)
raise TypeError(
'Wrong number of arguments for {!r}. '
'It must be either {} or {} (with outputs), '
'but given {}.'.format(
self.name, self.nin, self.nargs, n_args))
# parse inputs (positional) and outputs (positional or keyword)
in_args = args[:self.nin]
out_args = args[self.nin:]
if out is not None:
if out_args:
raise ValueError('Cannot specify \'out\' as both '
'a positional and keyword argument')
if isinstance(out, tuple):
if len(out) != self.nout:
raise ValueError(
"The 'out' tuple must have exactly one entry per "
"ufunc output")
out_args = out
else:
if 1 != self.nout:
raise ValueError("'out' must be a tuple of arrays")
out_args = out,
dev_id = device.get_device_id()
in_args = _preprocess_args(dev_id, in_args, False)
out_args = _preprocess_optional_args(dev_id, out_args, False)
given_out_args = [o for o in out_args if o is not None]
# TODO(kataoka): Typecheck `in_args` w.r.t. `casting` (before
# broadcast).
if has_where:
where_args = _preprocess_args(dev_id, (where,), False)
x = where_args[0]
if isinstance(x, _ndarray_base):
# NumPy seems to use casting='safe' here
if x.dtype != bool:
raise TypeError(
f'Cannot cast array data from {x.dtype!r} to '
f'{get_dtype(bool)!r} according to the rule \'safe\'')
else:
# NumPy does not seem to raise TypeError.
# CuPy does not have to support `where=object()` etc. and
# `_preprocess_args` rejects it anyway.
where_args[0] = _scalar.CScalar.from_numpy_scalar_with_dtype(
x, numpy.bool_)
else:
where_args = []
# _copy_in_args_if_needed updates in_args
_copy_in_args_if_needed(in_args, given_out_args)
_copy_in_args_if_needed(where_args, given_out_args)
broad_values = in_args + where_args + given_out_args
# _broadcast updates shape
internal._broadcast_core(broad_values, shape)
if (self._cutensor_op is not None
and _accelerator.ACCELERATOR_CUTENSOR in
_accelerator._elementwise_accelerators):
if (self.nin == 2 and self.nout == 1 and
isinstance(in_args[0], _ndarray_base) and
isinstance(in_args[1], _ndarray_base)):
import cupyx.cutensor
ret = cupyx.cutensor._try_elementwise_binary_routine(
in_args[0], in_args[1], dtype,
out_args[0] if len(out_args) == 1 else None,
self._cutensor_op,
self._cutensor_alpha,
self._cutensor_gamma,
)
if ret is not None:
return ret
op = self._ops.guess_routine(
self.name, self._routine_cache, in_args, dtype, self._out_ops)
# Determine a template object from which we initialize the output when
# inputs have subclass instances
def issubclass1(cls, classinfo):
return issubclass(cls, classinfo) and cls is not classinfo
subtype = cupy.ndarray
template = None
for in_arg in in_args:
in_arg_type = type(in_arg)
if issubclass1(in_arg_type, cupy.ndarray):
subtype = in_arg_type
template = in_arg
break
out_args = _get_out_args_from_optionals(
subtype, out_args, op.out_types, shape, casting, template)
if self.nout == 1:
ret = out_args[0]
else:
ret = tuple(out_args)
if _contains_zero(shape):
return ret
inout_args = []
for i, t in enumerate(op.in_types):
x = broad_values[i]
inout_args.append(
x if isinstance(x, _ndarray_base) else
_scalar.CScalar.from_numpy_scalar_with_dtype(x, t))
if has_where:
x = broad_values[self.nin]
inout_args.append(x)
inout_args.extend(out_args)
shape = _reduce_dims(inout_args, self._params, shape)
indexer = _carray._indexer_init(shape)
inout_args.append(indexer)
arginfos = _get_arginfos(inout_args)
kern = self._get_ufunc_kernel(dev_id, op, arginfos, has_where)
kern.linear_launch(indexer.size, inout_args)
return ret
cdef str _get_name_with_type(self, tuple arginfos, bint has_where):
cdef str name = self.name
if has_where:
name += '_where'
cdef _ArgInfo arginfo
inout_type_words = []
for arginfo in arginfos:
dtype = str(numpy.dtype(arginfo.dtype))
if arginfo.is_ndarray():
inout_type_words.append(dtype)
elif arginfo.is_scalar():
inout_type_words.append(dtype.rstrip('0123456789'))
return '{}__{}'.format(name, '_'.join(inout_type_words))
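# For example (illustrative, not in the original source): calling cupy.add
# on two float32 arrays with a float32 output would produce a kernel name
# along the lines of 'cupy_add__float32_float32_float32'; scalar inputs
# drop the trailing size digits (e.g. 'float' instead of 'float32') via
# the rstrip above.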
cdef function.Function _get_ufunc_kernel(
self, int dev_id, _Op op, tuple arginfos, bint has_where):
cdef function.Function kern
key = (dev_id, op, arginfos, has_where)
kern = self._kernel_memo.get(key, None)
if kern is None:
name = self._get_name_with_type(arginfos, has_where)
params = self._params_with_where if has_where else self._params
kern = _get_ufunc_kernel(
op.in_types, op.out_types, op.routine, arginfos, has_where,
params, name, self._preamble, self._loop_prep)
self._kernel_memo[key] = kern
return kern
def outer(self, A, B, **kwargs):
"""Apply the ufunc operation to all pairs of elements in A and B.
.. seealso::
:meth:`numpy.ufunc.outer`
"""
A = core.array(A)
B = core.array(B)
ndim_a = A.ndim
ndim_b = B.ndim
A = A.reshape(A.shape + (1,) * ndim_b)
B = B.reshape((1,) * ndim_a + B.shape)
return self(A, B, **kwargs)
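# Usage sketch (hedged, not part of the original source): outer() reshapes
# the operands so that broadcasting forms all pairs, hence the result shape
# is A.shape + B.shape:
#
#     a = cupy.arange(3)               # shape (3,)
#     b = cupy.arange(4)               # shape (4,)
#     cupy.multiply.outer(a, b).shape  # -> (3, 4)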
def at(self, a, indices, b=None):
"""Apply in place operation on the operand ``a`` for elements
specified by ``indices``.
.. seealso::
:meth:`numpy.ufunc.at`
"""
if self._scatter_op is not None:
a._scatter_op(indices, b, self._scatter_op)
else:
raise NotImplementedError(f'`{self.name}.at` is not supported yet')
def reduce(self, array, axis=0, dtype=None, out=None, keepdims=False):
"""Reduce ``array`` applying ufunc.
.. seealso::
:meth:`numpy.ufunc.reduce`
"""
if self.name == 'cupy_add':
return array.sum(axis, dtype, out, keepdims)
if self.name == 'cupy_multiply':
return array.prod(axis, dtype, out, keepdims)
raise NotImplementedError(f'`{self.name}.reduce` is not supported yet')
def accumulate(self, array, axis=0, dtype=None, out=None):
"""Accumulate ``array`` applying ufunc.
.. seealso::
:meth:`numpy.ufunc.accumulate`
"""
if self.name == 'cupy_add':
return array.cumsum(axis, dtype, out)
if self.name == 'cupy_multiply':
return array.cumprod(axis, dtype, out)
raise NotImplementedError(
f'`{self.name}.accumulate` is not supported yet')
def reduceat(self, array, indices, axis=0, dtype=None, out=None):
"""Reduce ``array`` applying ufunc with indices.
.. seealso::
:meth:`numpy.ufunc.reduceat`
"""
if self.name == 'cupy_add':
return array._add_reduceat(indices, axis, dtype, out)
raise NotImplementedError(
f'`{self.name}.reduceat` is not supported yet')
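# Usage sketch (illustrative): only the additive/multiplicative ufuncs have
# fast paths in the methods above, delegating to the ndarray reductions:
#
#     x = cupy.arange(6).reshape(2, 3)
#     cupy.add.reduce(x, axis=0)    # same as x.sum(axis=0)
#     cupy.multiply.accumulate(x)   # same as x.cumprod(axis=0)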
cdef class _Op:
def __init__(
self, tuple in_types, tuple out_types, object routine,
object error_func):
if error_func is None:
assert routine is not None
else:
assert callable(error_func)
self.in_types = in_types
self.out_types = out_types
self.nin = len(in_types)
self.nout = len(out_types)
self.routine = routine
self.error_func = error_func
@staticmethod
cdef _Op _from_type_and_routine_or_error_func(
str typ, object routine, object error_func):
# TODO(niboshi): Write type mapping specification.
types = typ.split('->')
if len(types) == 1:
in_types = out_types = tuple(types)
else:
in_types, out_types = map(tuple, types)
in_types = tuple([get_dtype(t).type for t in in_types])
out_types = tuple([get_dtype(t).type for t in out_types])
return _Op(in_types, out_types, routine, error_func)
@staticmethod
cdef _Op from_type_and_routine(str typ, routine):
return _Op._from_type_and_routine_or_error_func(typ, routine, None)
@staticmethod
cdef _Op from_type_and_error_func(str typ, error_func):
return _Op._from_type_and_routine_or_error_func(typ, None, error_func)
cdef check_valid(self):
if self.error_func is not None:
self.error_func()
cpdef tuple get_in_dtypes(self):
return tuple([get_dtype(t) for t in self.in_types])
cpdef tuple get_out_dtypes(self):
return tuple([get_dtype(t) for t in self.out_types])
cdef class _Ops:
def __init__(self, tuple ops):
assert len(ops) > 0
nin = ops[0].nin
nout = ops[0].nout
assert all(op.nin == nin for op in ops)
assert all(op.nout == nout for op in ops)
self.ops = ops
self.nin = nin
self.nout = nout
@staticmethod
cdef _Ops from_tuples(object ops, routine):
ops_ = []
for t in ops:
if isinstance(t, tuple):
typ, rt = t
if isinstance(rt, tuple):
rt = tuple([r1 or r2 for r1, r2 in zip(rt, routine)])
elif not isinstance(rt, str):
assert callable(rt)
ops_.append(_Op.from_type_and_error_func(typ, rt))
continue
else:
assert isinstance(t, str)
typ, rt = t, routine
ops_.append(_Op.from_type_and_routine(typ, rt))
return _Ops(tuple(ops_))
cpdef _Op guess_routine(
self, str name, dict cache, list in_args, dtype, _Ops out_ops):
cdef _Ops ops_
if dtype is None:
use_raw_value = _check_should_use_min_scalar(in_args)
if use_raw_value:
in_types = tuple([
a.dtype.type if isinstance(a, _ndarray_base)
else _min_scalar_type(a)
for a in in_args])
else:
in_types = tuple([a.dtype.type for a in in_args])
op = cache.get(in_types, ())
if op is ():
op = self._guess_routine_from_in_types(in_types)
cache[in_types] = op
else:
op = cache.get(dtype, ())
if op is ():
ops_ = out_ops or self
op = ops_._guess_routine_from_dtype(dtype)
cache[dtype] = op
if op is not None:
# raise TypeError if the type combination is disallowed
(<_Op>op).check_valid()
return op
if dtype is None:
dtype = tuple([a.dtype.type for a in in_args])
raise TypeError('Wrong type (%s) of arguments for %s' %
(dtype, name))
cpdef _Op _guess_routine_from_in_types(
self, tuple in_types, object can_cast=_numpy_can_cast):
cdef _Op op
cdef tuple op_types
cdef Py_ssize_t n = len(in_types)
cdef Py_ssize_t i
for op in self.ops:
op_types = op.in_types
for i in range(n):
it = in_types[i]
ot = op_types[i]
if isinstance(it, tuple):
if not can_cast(it[0], ot) and not can_cast(it[1], ot):
break
elif not can_cast(it, ot):
break
else:
return op
return None
cpdef _Op _guess_routine_from_dtype(self, object dtype):
cdef _Op op
cdef tuple op_types
for op in self.ops:
op_types = op.out_types
for t in op_types:
if t != dtype:
break
else:
return op
return None
cpdef create_ufunc(name, ops, routine=None, preamble='', doc='',
default_casting=None, loop_prep='', out_ops=None,
cutensor_op=None, scatter_op=None):
ops_ = _Ops.from_tuples(ops, routine)
_out_ops = None if out_ops is None else _Ops.from_tuples(out_ops, routine)
return ufunc(
name, ops_.nin, ops_.nout, ops_, preamble,
loop_prep, doc, default_casting=default_casting, out_ops=_out_ops,
cutensor_op=cutensor_op, scatter_op=scatter_op)
from cupy._core.core cimport _ndarray_base
from libcpp.pair cimport pair
cpdef pair[Py_ssize_t, Py_ssize_t] get_bound(_ndarray_base array)
cpdef bint may_share_bounds(_ndarray_base a, _ndarray_base b)
from cupy._core.core cimport _ndarray_base
from cupy.cuda cimport memory
from libcpp.pair cimport pair
cpdef pair[Py_ssize_t, Py_ssize_t] get_bound(_ndarray_base array):
cdef Py_ssize_t left = array.data.ptr
cdef Py_ssize_t right = left
cdef Py_ssize_t tmp
cdef pair[Py_ssize_t, Py_ssize_t] ret
cdef size_t i
for i in range(array._shape.size()):
# shape[i] != 0 is assumed
tmp = (array._shape[i] - 1) * array._strides[i]
if tmp > 0:
right += tmp
else:
left += tmp
ret.first = left
ret.second = right + <Py_ssize_t>array.dtype.itemsize
return ret
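# Worked example (hedged, not in the original source): for a C-contiguous
# float32 array of shape (2, 3), the strides are (12, 4), so
#
#     right = ptr + (2-1)*12 + (3-1)*4 = ptr + 20
#
# and the returned half-open byte range is [ptr, ptr + 20 + 4) =
# [ptr, ptr + 24), i.e. exactly the 6 * 4 bytes the array touches.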
cpdef bint may_share_bounds(_ndarray_base a, _ndarray_base b):
cdef memory.MemoryPointer a_data = a.data
cdef memory.MemoryPointer b_data = b.data
cdef pair[Py_ssize_t, Py_ssize_t] a_range, b_range
if (a_data.device_id != b_data.device_id
or a_data.mem.ptr != b_data.mem.ptr
or a.size == 0 or b.size == 0):
return False
a_range = get_bound(a)
b_range = get_bound(b)
return a_range.first < b_range.second and b_range.first < a_range.second
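# Usage sketch (illustrative): overlapping views of the same buffer are
# detected by intersecting their byte ranges:
#
#     a = cupy.arange(4, dtype=cupy.float32)
#     may_share_bounds(a[:2], a[1:3])  # True: byte ranges intersect
#     may_share_bounds(a[:2], a[2:])   # False: ranges only touch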
cdef object _thread_local
cdef dict _contexts
cdef class _OptimizationConfig:
cdef readonly object optimize_impl
cdef readonly int max_trials
cdef readonly float timeout
cdef readonly float expected_total_time_per_trial
cdef readonly float max_total_time_per_trial
cdef class _OptimizationContext:
cdef readonly str key
cdef readonly _OptimizationConfig config
cdef readonly dict _params_map
cdef readonly bint _dirty
cpdef _OptimizationContext get_current_context()
import pickle
import threading
cdef _thread_local = threading.local()
cdef _contexts = {}
cdef class _OptimizationConfig:
def __init__(
self, optimize_impl, *,
int max_trials=100,
float timeout=1,
float expected_total_time_per_trial=100 * 1e-6,
float max_total_time_per_trial=0.1):
self.optimize_impl = optimize_impl
self.max_trials = max_trials
self.timeout = timeout
self.expected_total_time_per_trial = expected_total_time_per_trial
self.max_total_time_per_trial = max_total_time_per_trial
cdef class _OptimizationContext:
def __init__(self, str key, _OptimizationConfig config):
self.key = key
self.config = config
self._params_map = {}
self._dirty = False
def get_params(self, key):
return self._params_map.get(key)
def set_params(self, key, params):
self._params_map[key] = params
self._dirty = True
def save(self, filepath):
with open(filepath, mode='wb') as f:
pickle.dump((self.key, self._params_map), f)
self._dirty = False
def load(self, filepath):
with open(filepath, mode='rb') as f:
key, params_map = pickle.load(f)
if key != self.key:
raise ValueError(
'Optimization key mismatch {} != {}'.format(key, self.key))
self._params_map = params_map
self._dirty = False
def _is_dirty(self):
return self._dirty
cpdef _OptimizationContext get_current_context():
try:
return _thread_local.current_context
except AttributeError:
return None
def set_current_context(_OptimizationContext context):
_thread_local.current_context = context
def get_new_context(
str key, object optimize_impl, dict config_dict):
c = _contexts.get(key)
if c is None:
config = _OptimizationConfig(optimize_impl, **config_dict)
c = _OptimizationContext(key, config)
_contexts[key] = c
return c
def _clear_all_contexts_cache():
global _contexts
assert get_current_context() is None
_contexts = {}
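# Usage sketch (hedged; `my_impl` is a hypothetical optimizer callback):
# contexts are keyed and cached process-wide, and tuned parameter maps can
# be persisted between runs:
#
#     ctx = get_new_context('my_key', my_impl, {'max_trials': 10})
#     set_current_context(ctx)
#     ...                        # kernels record tuned params into ctx
#     ctx.save('/tmp/params.pickle')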
from cupy._core._carray cimport shape_t
from cupy._core cimport _kernel
from cupy._core.core cimport _ndarray_base
from cupy.cuda cimport function
cdef Py_ssize_t _block_size
cpdef tuple _get_axis(object axis, Py_ssize_t ndim)
cpdef shape_t _get_out_shape(
const shape_t& shape, tuple reduce_axis, tuple out_axis, bint keepdims)
cdef class _AbstractReductionKernel:
cdef:
readonly str name
public str identity
readonly tuple in_params
readonly tuple out_params
readonly tuple _params
readonly str __name__
readonly dict _cached_codes
cpdef _ndarray_base _call(
self,
list in_args, list out_args,
const shape_t& a_shape, axis, dtype,
bint keepdims, bint reduce_dims, int device_id,
stream, bint try_use_cub=*, bint sort_reduce_axis=*)
cdef void _launch(
self, out_block_num, block_size, block_stride,
in_args, out_args, in_shape, out_shape, types,
map_expr, reduce_expr, post_map_expr, reduce_type,
stream, params)
cdef tuple _get_expressions_and_types(
self, list in_args, list out_args, dtype)
cdef list _get_out_args(
self, list out_args, tuple out_types, const shape_t& out_shape)
cdef function.Function _get_function(
self,
tuple params, tuple arginfos, _kernel._TypeMap types,
str map_expr, str reduce_expr, str post_map_expr, str reduce_type,
Py_ssize_t block_size)
cdef class ReductionKernel(_AbstractReductionKernel):
cdef:
readonly int nin
readonly int nout
readonly int nargs
readonly tuple params
readonly str reduce_expr
readonly str map_expr
readonly str post_map_expr
readonly object options
readonly bint reduce_dims
readonly object reduce_type
readonly str preamble
cdef shape_t _set_permuted_args(
list args, tuple axis_permutes, const shape_t& shape, tuple params)
cdef tuple _get_shape_and_strides(list in_args, list out_args)
cdef _optimizer_copy_arg(a)
cpdef create_reduction_func(
name, ops, routine=*, identity=*, preamble=*, sort_reduce_axis=*)
from cpython cimport sequence
from cupy._core cimport _carray
from cupy._core cimport _accelerator
from cupy._core._carray cimport shape_t
from cupy._core cimport _cub_reduction
from cupy._core._dtype cimport get_dtype
from cupy._core cimport _kernel
from cupy._core._kernel cimport _broadcast
from cupy._core._kernel cimport _check_peer_access
from cupy._core._kernel cimport _get_arginfos
from cupy._core._kernel cimport _get_out_args_from_optionals
from cupy._core._kernel cimport _get_out_args_with_params
from cupy._core._kernel cimport _preprocess_args
from cupy._core._kernel cimport _reduce_dims
from cupy._core._kernel cimport ParameterInfo, _ArgInfo
from cupy._core cimport _optimize_config
from cupy._core cimport _routines_manipulation as _manipulation
from cupy._core cimport _scalar
from cupy._core._scalar import get_typename as _get_typename
from cupy._core.core cimport _convert_object_with_cuda_array_interface
from cupy._core.core cimport _create_ndarray_from_shape_strides
from cupy._core.core cimport compile_with_cache
from cupy._core.core cimport _ndarray_base
from cupy._core cimport internal
from cupy.cuda cimport device
from cupy.cuda cimport function
from cupy_backends.cuda.api cimport runtime
import math
import string
import warnings
import numpy
import cupy
from cupy._core._kernel import _get_param_info
from cupy._core._kernel import _decide_params_type
from cupy._core._ufuncs import elementwise_copy
from cupy.cuda import compiler
from cupy import _util
cpdef str _create_reduction_function_code(
name, block_size, reduce_type, params, arginfos, identity,
pre_map_expr, reduce_expr, post_map_expr,
_kernel._TypeMap type_map, input_expr, output_expr, preamble, options):
# An (incomplete) list of internal variables:
# _J : the index of an element in the array
# _block_size : the number of threads in a block; should be power of 2
# _block_stride : the number of elements being processed by a block; should
# be power of 2 and <= _block_size
module_code = string.Template('''
${type_preamble}
${preamble}
#define REDUCE(a, b) (${reduce_expr})
#define POST_MAP(a) (${post_map_expr})
#define _REDUCE(_offset) if (_tid < _offset) { \
_type_reduce _a = _sdata[_tid], _b = _sdata[(_tid + _offset)]; \
_sdata[_tid] = REDUCE(_a, _b); \
}
typedef ${reduce_type} _type_reduce;
extern "C" __global__ void ${name}(${params}) {
__shared__ char _sdata_raw[${block_size} * sizeof(_type_reduce)];
_type_reduce *_sdata = reinterpret_cast<_type_reduce*>(_sdata_raw);
unsigned int _tid = threadIdx.x;
int _J_offset = _tid >> __popc(_block_stride - 1); // _tid / _block_stride
ptrdiff_t _j_offset = (ptrdiff_t)_J_offset * _out_ind.size();
int _J_stride = ${block_size} >> __popc(_block_stride - 1);
ptrdiff_t _j_stride = (ptrdiff_t)_J_stride * _out_ind.size();
for (ptrdiff_t _i_base = (ptrdiff_t)blockIdx.x * _block_stride;
_i_base < _out_ind.size();
_i_base += (ptrdiff_t)gridDim.x * _block_stride) {
_type_reduce _s = _type_reduce(${identity});
ptrdiff_t _i =
_i_base + (_tid & (_block_stride - 1)); // _tid % _block_stride
int _J = _J_offset;
for (ptrdiff_t _j = _i + _j_offset; _j < _in_ind.size();
_j += _j_stride, _J += _J_stride) {
_in_ind.set(_j);
${input_expr}
_type_reduce _a = static_cast<_type_reduce>(${pre_map_expr});
_s = REDUCE(_s, _a);
}
_sdata[_tid] = _s;
__syncthreads();
for (unsigned int _block = ${block_size} / 2;
_block >= _block_stride; _block >>= 1) {
if (_tid < _block) {
_REDUCE(_block);
}
__syncthreads();
}
if (_tid < _block_stride) {
_s = _sdata[_tid];
}
if (_tid < _block_stride && _i < _out_ind.size()) {
_out_ind.set(static_cast<ptrdiff_t>(_i));
${output_expr}
POST_MAP(_s);
}
}
}''').substitute(
name=name,
block_size=block_size,
reduce_type=reduce_type,
params=_kernel._get_kernel_params(params, arginfos),
identity=identity,
reduce_expr=reduce_expr,
pre_map_expr=pre_map_expr,
post_map_expr=post_map_expr,
type_preamble=type_map.get_typedef_code(),
input_expr=input_expr,
output_expr=output_expr,
preamble=preamble)
return module_code
cpdef function.Function _create_reduction_function_from_code(
name, code, options):
module = compile_with_cache(code, options)
return module.get_function(name)
cpdef function.Function _create_reduction_function(
name, block_size, reduce_type, params, arginfos, identity,
pre_map_expr, reduce_expr, post_map_expr,
_kernel._TypeMap type_map, input_expr, output_expr, preamble, options):
code = _create_reduction_function_code(
name, block_size, reduce_type, params, arginfos, identity,
pre_map_expr, reduce_expr, post_map_expr, type_map, input_expr,
output_expr, preamble, options
)
return _create_reduction_function_from_code(name, code, options)
cpdef tuple _get_axis(object axis, Py_ssize_t ndim):
cdef Py_ssize_t dim
if axis is None:
return (tuple(range(ndim)), ())
elif sequence.PySequence_Check(axis):
axis = tuple(axis)
else:
axis = axis,
reduce_axis = tuple(sorted(
[internal._normalize_axis_index(dim, ndim) for dim in axis]))
out_axis = tuple([dim for dim in range(ndim) if dim not in reduce_axis])
if len(reduce_axis) + len(out_axis) != ndim:
raise ValueError("duplicate value in 'axis'")
return reduce_axis, out_axis
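# Examples (illustrative, not in the original source):
#
#     _get_axis(None, 3)  # -> ((0, 1, 2), ()): reduce everything
#     _get_axis(1, 3)     # -> ((1,), (0, 2))
#     _get_axis(-1, 3)    # -> ((2,), (0, 1)): negative axes normalized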
cpdef shape_t _get_out_shape(
const shape_t& shape, tuple reduce_axis, tuple out_axis,
bint keepdims):
cdef shape_t out_shape
if keepdims:
out_shape = shape
for i in reduce_axis:
out_shape[i] = 1
else:
out_shape.reserve(len(out_axis))
for i in out_axis:
out_shape.push_back(shape[i])
return out_shape
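# Example (illustrative): reducing shape (2, 3, 4) over axis 1 gives
#
#     _get_out_shape((2, 3, 4), (1,), (0, 2), False)  # -> (2, 4)
#     _get_out_shape((2, 3, 4), (1,), (0, 2), True)   # -> (2, 1, 4)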
cdef shape_t _set_permuted_args(
list args, tuple axis_permutes, const shape_t& shape, tuple params):
# This function updates `args`
cdef ParameterInfo p
cdef Py_ssize_t i, s
cdef bint need_permutation = False
cdef shape_t out_shape
for i, s in enumerate(axis_permutes):
if i != s:
need_permutation = True
break
if need_permutation:
for p in params:
if p.raw:
raise NotImplementedError('Illegal conditions')
for i, a in enumerate(args):
if isinstance(a, _ndarray_base):
args[i] = _manipulation._transpose(a, axis_permutes)
out_shape.reserve(len(axis_permutes))
for i in axis_permutes:
out_shape.push_back(shape[i])
return out_shape
else:
return shape
cdef Py_ssize_t _get_contiguous_size(
list args, tuple params, list out_shape, Py_ssize_t ndim) except -1:
'''
Get the contiguous size along the *output* axes (not the *reduce* axes!).
'''
cdef int i, j
cdef ParameterInfo p
cdef Py_ssize_t contiguous_size, tmp_contiguous_size, itemsize
out_ndim = len(out_shape)
contiguous_size = 1
for i, a in enumerate(args):
if not isinstance(a, _ndarray_base):
continue
p = params[i]
if p.raw:
continue
tmp_contiguous_size = 1
itemsize = a.dtype.itemsize
for j in range(out_ndim):
if a._strides[ndim-j-1] != tmp_contiguous_size * itemsize:
break
tmp_contiguous_size *= out_shape[out_ndim-j-1]
contiguous_size = max(contiguous_size, tmp_contiguous_size)
return contiguous_size
cdef Py_ssize_t _default_block_size = (
256 if runtime._is_hip_environment else 512)
cdef Py_ssize_t _min_block_size_log = 5
cdef Py_ssize_t _max_block_size_log = (
8 if runtime._is_hip_environment else 9)
cpdef (Py_ssize_t, Py_ssize_t, Py_ssize_t) _get_block_specs( # NOQA
Py_ssize_t in_size, Py_ssize_t out_size,
Py_ssize_t contiguous_size,
Py_ssize_t block_size) except*:
cdef Py_ssize_t reduce_block_size, block_stride, out_block_num
if block_size == -1:
block_size = _default_block_size
reduce_block_size = max(1, in_size // out_size)
contiguous_size = min(contiguous_size, 32)
block_stride = max(contiguous_size, block_size // reduce_block_size)
block_stride = internal.clp2(block_stride // 2 + 1) # floor
out_block_num = (out_size + block_stride - 1) // block_stride
return block_size, block_stride, out_block_num
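# Worked example (hedged sketch): with in_size=4096, out_size=16,
# contiguous_size=64 and the CUDA default block_size=512:
#
#     reduce_block_size = max(1, 4096 // 16)  = 256
#     contiguous_size   = min(64, 32)         = 32
#     block_stride      = max(32, 512 // 256) = 32  # already a power of 2
#     out_block_num     = (16 + 31) // 32     = 1
#
# so one block of 512 threads covers all 16 outputs, each reduced by a
# 512 / 32 = 16-way thread subgroup.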
cdef tuple _sort_axis(tuple axis, tuple strides):
# Sorts axis in the decreasing order of absolute values of strides.
return tuple(sorted(axis, key=lambda i: -abs(strides[i])))
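# Example (illustrative): with strides (4, 32, 8), axes are reordered as
#
#     _sort_axis((0, 1, 2), (4, 32, 8))  # -> (1, 2, 0)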
cdef tuple _get_shape_and_strides(list in_args, list out_args):
cdef list shape_and_strides = []
for x in in_args + out_args:
if isinstance(x, _ndarray_base):
shape_and_strides.append(x.shape)
shape_and_strides.append(x.strides)
else:
shape_and_strides.append(None)
shape_and_strides.append(None)
return tuple(shape_and_strides)
cdef _optimizer_copy_arg(a):
if isinstance(a, _ndarray_base):
x = _create_ndarray_from_shape_strides(
cupy.ndarray, a._shape, a._strides, a.dtype, None)
assert a.data.device_id == x.data.device_id
elementwise_copy(a, x)
return x
return a
cdef class _AbstractReductionKernel:
def __init__(
self, str name, str identity, str in_params, str out_params):
assert name is not None
assert identity is not None
assert in_params is not None
assert out_params is not None
in_params_ = _get_param_info(in_params, True)
out_params_ = _get_param_info(out_params, False)
params = (
in_params_
+ out_params_
+ _get_param_info('CIndexer _in_ind, CIndexer _out_ind', False)
+ _get_param_info('int32 _block_stride', True))
self.name = name
self.identity = identity
self.in_params = in_params_
self.out_params = out_params_
self._params = params
# This is for profiling mechanisms to auto infer a name
self.__name__ = name
self._cached_codes = {}
cpdef _ndarray_base _call(
self,
list in_args, list out_args,
const shape_t& a_shape, axis, dtype,
bint keepdims, bint reduce_dims, int device_id,
stream, bint try_use_cub=False, bint sort_reduce_axis=True):
cdef tuple reduce_axis, out_axis, axis_permutes
cdef tuple params, opt_params
cdef tuple shape_and_strides
cdef Py_ssize_t contiguous_size = -1
cdef Py_ssize_t block_size, block_stride, out_block_num = 0
cdef shape_t in_shape, out_shape
cdef _ndarray_base ret
cdef bint cub_success
if dtype is not None:
dtype = get_dtype(dtype).type
(
map_expr, reduce_expr, post_map_expr,
in_types, out_types, reduce_type,
type_map,
) = self._get_expressions_and_types(in_args, out_args, dtype)
reduce_axis, out_axis = _get_axis(axis, a_shape.size())
# When there is only one input array, sort the axes in such a way that
# contiguous (C or F) axes can be squashed in _reduce_dims() later.
# TODO(niboshi): Support (out_axis) > 1
if (len(in_args) == 1
and len(out_axis) <= 1
and not in_args[0]._c_contiguous):
strides = in_args[0].strides
if sort_reduce_axis:
reduce_axis = _sort_axis(reduce_axis, strides)
out_axis = _sort_axis(out_axis, strides)
out_shape = _get_out_shape(a_shape, reduce_axis, out_axis, keepdims)
out_args = self._get_out_args(out_args, out_types, out_shape)
ret = out_args[0]
if ret.size == 0:
return ret
if self.identity == '' and internal.is_in(a_shape, 0):
raise ValueError(('zero-size array to reduction operation'
' %s which has no identity') % self.name)
in_args = [x if isinstance(x, _ndarray_base) else
_scalar.CScalar.from_numpy_scalar_with_dtype(x, t)
for x, t in zip(in_args, in_types)]
optimize_context = _optimize_config.get_current_context()
key = ()
if optimize_context is not None:
# Calculate a key unique to the reduction setting.
shape_and_strides = _get_shape_and_strides(in_args, out_args)
key = (self.name, shape_and_strides,
in_types, out_types, reduce_type, device_id)
# Try to use CUB
for accelerator in _accelerator._reduction_accelerators:
if try_use_cub and accelerator == _accelerator.ACCELERATOR_CUB:
cub_success = _cub_reduction._try_to_call_cub_reduction(
self, in_args, out_args, a_shape, stream, optimize_context,
key, map_expr, reduce_expr, post_map_expr, reduce_type,
type_map, reduce_axis, out_axis, out_shape, ret)
if cub_success:
return ret
axis_permutes = reduce_axis + out_axis
in_shape = _set_permuted_args(
in_args, axis_permutes, a_shape, self.in_params)
if reduce_dims:
in_shape = _reduce_dims(in_args, self.in_params, in_shape)
out_shape = _reduce_dims(out_args, self.out_params, out_shape)
params = self._params
# Calculate the reduction block dimensions.
if optimize_context is None:
# Calculate manually
contiguous_size = _get_contiguous_size(
in_args, self.in_params, out_shape, in_shape.size())
block_size, block_stride, out_block_num = _get_block_specs(
internal.prod(in_shape),
internal.prod(out_shape),
contiguous_size, -1)
else:
# Optimize dynamically
key = ('simple_reduction',) + key
opt_params = optimize_context.get_params(key)
if opt_params is None:
opt_params = self._get_optimized_params(
optimize_context.config, in_args, out_args,
in_shape, out_shape, type_map, map_expr, reduce_expr,
post_map_expr, reduce_type, stream)
optimize_context.set_params(key, opt_params)
block_size, block_stride, out_block_num = opt_params
# Launch the kernel
self._launch(
out_block_num,
block_size,
block_stride,
in_args, out_args,
in_shape, out_shape,
type_map,
map_expr, reduce_expr, post_map_expr, reduce_type,
stream, params)
return ret
def _get_optimized_params(
self, optimize_config, in_args, out_args, in_shape, out_shape,
type_map, map_expr, reduce_expr, post_map_expr, reduce_type,
stream):
out_size = internal.prod(out_shape)
in_args = [_optimizer_copy_arg(a) for a in in_args]
out_args = [_optimizer_copy_arg(a) for a in out_args]
contiguous_size = _get_contiguous_size(
in_args, self.in_params, out_shape, len(in_shape))
block_size, block_stride, default_out_block_num = _get_block_specs(
internal.prod(in_shape),
internal.prod(out_shape),
contiguous_size, -1)
default_block_size_log = math.floor(math.log2(block_size))
default_block_stride_log = math.floor(math.log2(block_stride))
def target_func(block_size, block_stride, out_block_num):
self._launch(
out_block_num, block_size, block_stride, in_args, out_args,
in_shape, out_shape, type_map, map_expr, reduce_expr,
post_map_expr, reduce_type, stream, self._params)
def suggest_func(trial):
block_size_log = trial.suggest_int(
'block_size_log', _min_block_size_log, _max_block_size_log)
block_size = 2 ** block_size_log
block_stride_log = trial.suggest_int(
'block_stride_log', 0, block_size_log)
block_stride = 2 ** block_stride_log
max_out_block_num = (out_size + block_stride - 1) // block_stride
out_block_num = trial.suggest_int(
'out_block_num', 1, max_out_block_num)
trial.set_user_attr('block_size', block_size)
trial.set_user_attr('block_stride', block_stride)
return block_size, block_stride, out_block_num
optimize_impl = optimize_config.optimize_impl
best = optimize_impl(
optimize_config, target_func, suggest_func,
default_best={
'block_size_log': default_block_size_log,
'block_stride_log': default_block_stride_log,
'out_block_num': default_out_block_num,
}
)
return (
best.user_attrs['block_size'],
best.user_attrs['block_stride'],
best.params['out_block_num'])
cdef inline void _launch(
self, out_block_num, block_size, block_stride,
in_args, out_args, in_shape, out_shape, type_map,
map_expr, reduce_expr, post_map_expr, reduce_type,
stream, params):
cdef function.Function func
inout_args = (
in_args
+ out_args
+ [
_carray._indexer_init(in_shape),
_carray._indexer_init(out_shape),
# block_stride is passed as the last argument.
_scalar.CScalar.from_int32(block_stride),
])
# Retrieve the kernel function
func = self._get_function(
params,
_get_arginfos(inout_args),
type_map,
map_expr, reduce_expr, post_map_expr, reduce_type,
block_size)
# Launch the kernel
func.linear_launch(
out_block_num * block_size, inout_args, 0, block_size, stream)
cdef tuple _get_expressions_and_types(
self, list in_args, list out_args, dtype):
raise NotImplementedError()
cdef list _get_out_args(
self, list out_args, tuple out_types, const shape_t& out_shape):
raise NotImplementedError()
cdef function.Function _get_function(
self,
tuple params, tuple arginfos, _kernel._TypeMap type_map,
str map_expr, str reduce_expr, str post_map_expr, str reduce_type,
Py_ssize_t block_size):
raise NotImplementedError()
@property
def cached_codes(self):
"""Returns a dict that has input types as keys and codes values.
This proprety method is for debugging purpose.
The return value is not guaranteed to keep backward compatibility.
"""
if len(self._cached_codes) == 0:
warnings.warn(
'No codes are cached because compilation is deferred until '
'the first function call or CUB is enabled.')
return dict([(k, v) for k, v in self._cached_codes.items()])
@property
def cached_code(self):
"""Returns `next(iter(self.cached_codes.values()))`.
This property method is for debugging purposes.
The return value is not guaranteed to keep backward compatibility.
"""
codes = self._cached_codes
if len(codes) > 1:
warnings.warn(
'The input types of the kernel could not be inferred. '
'Please use `.cached_codes` instead.')
return next(iter(codes.values()))
# -----------------------------------------------------------------------------
# create_reduction_func
# -----------------------------------------------------------------------------
cpdef _SimpleReductionKernel create_reduction_func(
name, ops, routine=None, identity=None, preamble='',
sort_reduce_axis=True):
ops = _kernel._Ops.from_tuples(ops, routine)
return _SimpleReductionKernel(
name, ops, identity, preamble, sort_reduce_axis)
cdef class _SimpleReductionKernel(_AbstractReductionKernel):
cdef:
readonly _kernel._Ops _ops
readonly str preamble
readonly int nin
readonly int nout
readonly str _input_expr
readonly str _output_expr
readonly dict _routine_cache
readonly bint _sort_reduce_axis
def __init__(
self, name, _kernel._Ops ops, identity, preamble,
sort_reduce_axis=True):
super().__init__(
name,
'' if identity is None else str(identity),
'T in0',
'T out0',
)
self._ops = ops
self.preamble = preamble
self.nin = 1
self.nout = 1
self._input_expr = 'const type_in0_raw in0 = _raw_in0[_in_ind.get()];'
self._output_expr = 'type_out0_raw &out0 = _raw_out0[_out_ind.get()];'
self._routine_cache = {}
self._sort_reduce_axis = sort_reduce_axis
def __call__(self, object a, axis=None, dtype=None, _ndarray_base out=None,
bint keepdims=False):
cdef _ndarray_base arr
if isinstance(a, _ndarray_base):
arr = a
elif hasattr(a, '__cuda_array_interface__'):
arr = _convert_object_with_cuda_array_interface(a)
elif hasattr(a, '__cupy_get_ndarray__'):
arr = a.__cupy_get_ndarray__()
else:
raise TypeError(
'Argument \'a\' has incorrect type (expected %s, got %s)' %
(cupy.ndarray, type(a)))
in_args = [arr]
dev_id = device.get_device_id()
_check_peer_access(arr, dev_id)
if out is None:
out_args = []
else:
_check_peer_access(out, dev_id)
out_args = [out]
reduce_dims = True
return self._call(
in_args, out_args,
arr._shape, axis, dtype, keepdims, reduce_dims, dev_id,
None, True, self._sort_reduce_axis)
cdef tuple _get_expressions_and_types(
self, list in_args, list out_args, dtype):
cdef _kernel._Op op
op = self._ops.guess_routine(
self.name, self._routine_cache, in_args, dtype, self._ops)
map_expr, reduce_expr, post_map_expr, reduce_type = op.routine
if reduce_type is None:
reduce_type = _get_typename(op.out_types[0])
if out_args:
out_type = out_args[0].dtype.type
else:
out_type = op.out_types[0]
# We guessed a routine that requires a C2R casting for the input
if (in_args[0].dtype.kind == 'c'
and numpy.dtype(op.in_types[0]).kind == 'f'):
warnings.warn(
'Casting complex values to real discards the imaginary part',
numpy.ComplexWarning)
in_args[0] = in_args[0].real
type_map = _kernel._TypeMap((
('type_in0_raw', in_args[0].dtype.type),
('type_out0_raw', out_type),
))
return (
map_expr, reduce_expr, post_map_expr,
op.in_types, op.out_types, reduce_type,
type_map)
cdef list _get_out_args(
self, list out_args, tuple out_types, const shape_t& out_shape):
return _get_out_args_from_optionals(
cupy.ndarray, out_args, out_types, out_shape, 'unsafe', None)
cdef function.Function _get_function(
self,
tuple params, tuple arginfos, _kernel._TypeMap type_map,
str map_expr, str reduce_expr, str post_map_expr, str reduce_type,
Py_ssize_t block_size):
in_types = []
for x in arginfos:
if x.type is cupy.ndarray:
in_types.append(cupy.dtype(x.dtype).char)
in_types = tuple(in_types)
if in_types not in self._cached_codes:
code = _SimpleReductionKernel_get_cached_function_code(
map_expr, reduce_expr, post_map_expr, reduce_type,
params, arginfos, type_map,
self.name, block_size, self.identity,
self._input_expr, self._output_expr, self.preamble, ())
self._cached_codes[in_types] = code
return _SimpleReductionKernel_get_cached_function(
map_expr, reduce_expr, post_map_expr, reduce_type,
params, arginfos, type_map,
self.name, block_size, self.identity,
self._input_expr, self._output_expr, self.preamble, ())
@_util.memoize()
def _SimpleReductionKernel_get_cached_function_code(
map_expr, reduce_expr, post_map_expr, reduce_type,
params, arginfos, _kernel._TypeMap type_map,
name, block_size, identity, input_expr, output_expr, preamble,
options):
return _create_reduction_function_code(
name, block_size, reduce_type, params, arginfos, identity,
map_expr, reduce_expr, post_map_expr,
type_map, input_expr, output_expr, preamble, options)
@_util.memoize(for_each_device=True)
def _SimpleReductionKernel_get_cached_function(
map_expr, reduce_expr, post_map_expr, reduce_type,
params, arginfos, _kernel._TypeMap type_map,
name, block_size, identity, input_expr, output_expr, preamble,
options):
return _create_reduction_function(
name, block_size, reduce_type, params, arginfos, identity,
map_expr, reduce_expr, post_map_expr,
type_map, input_expr, output_expr, preamble, options)
# -----------------------------------------------------------------------------
# ReductionKernel
# -----------------------------------------------------------------------------
cdef class ReductionKernel(_AbstractReductionKernel):
"""User-defined reduction kernel.
This class can be used to define a reduction kernel with or without
broadcasting.
The kernel is compiled at an invocation of the
:meth:`~ReductionKernel.__call__` method, which is cached for each device.
The compiled binary is also cached into a file under the
``$HOME/.cupy/kernel_cache/`` directory with a hashed file name. The cached
binary is reused by other processes.
Args:
in_params (str): Input argument list.
out_params (str): Output argument list.
map_expr (str): Mapping expression for input values.
reduce_expr (str): Reduction expression.
post_map_expr (str): Mapping expression for reduced values.
identity (str): Identity value for starting the reduction.
name (str): Name of the kernel function. It should be set for
readability of the performance profiling.
reduce_type (str): Type of values to be used for reduction. This type
is used to store the special variables ``a`` and ``b`` in
``reduce_expr``.
reduce_dims (bool): If ``True``, input arrays are reshaped without copy
to smaller dimensions for efficiency.
preamble (str): Fragment of the CUDA-C/C++ code that is inserted at the
top of the cu file.
options (tuple of str): Additional compilation options.
"""
def __init__(self, str in_params, str out_params,
map_expr, reduce_expr, post_map_expr,
identity, name='reduce_kernel', reduce_type=None,
reduce_dims=True, preamble='', options=()):
if not compiler.is_valid_kernel_name(name):
raise ValueError(
'Invalid kernel name: "%s"' % name)
super().__init__(
name,
'' if identity is None else str(identity),
in_params,
out_params,
)
self.nin = len(self.in_params)
self.nout = len(self.out_params)
self.nargs = self.nin + self.nout
self.reduce_expr = reduce_expr
self.map_expr = map_expr
self.post_map_expr = post_map_expr
self.options = options
self.reduce_dims = reduce_dims
if reduce_type is None:
self.reduce_type = self.out_params[0].ctype
else:
self.reduce_type = reduce_type
self.preamble = preamble
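# Usage sketch (hedged; mirrors the public ReductionKernel API): a row-wise
# sum written as a custom reduction:
#
#     kernel = cupy.ReductionKernel(
#         'T x', 'T y', 'x', 'a + b', 'y = a', '0', 'my_sum')
#     x = cupy.arange(10, dtype=cupy.float32).reshape(2, 5)
#     kernel(x, axis=1)   # -> array([10., 35.], dtype=float32)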
def __call__(self, *args, **kwargs):
"""Compiles and invokes the reduction kernel.
The compilation runs only if the kernel is not cached. Note that the
kernels with different argument dtypes, ndims, or axis are not
compatible. It means that single ReductionKernel object may be compiled
into multiple kernel binaries.
Args:
args: Arguments of the kernel.
out (cupy.ndarray): The output array. This can only be specified if
``args`` does not contain the output array.
axis (int or tuple of ints): Axis or axes along which the
reduction is performed.
keepdims (bool): If ``True``, the specified axes are retained as
axes of length one.
stream (cupy.cuda.Stream, optional): The CUDA stream to launch the
kernel on. If not given, the current stream will be used.
Returns:
Arrays are returned according to the ``out_params`` argument of the
``__init__`` method.
"""
cdef shape_t broad_shape
out = kwargs.pop('out', None)
axis = kwargs.pop('axis', None)
keepdims = kwargs.pop('keepdims', False)
stream = kwargs.pop('stream', None)
if kwargs:
raise TypeError('Wrong arguments %s' % kwargs)
n_args = len(args)
if n_args != self.nin and n_args != self.nargs:
raise TypeError('Wrong number of arguments for %s' % self.name)
out_args = list(args[self.nin:])
if out is not None:
if self.nout != 1:
raise NotImplementedError('')
if len(out_args) != 0:
raise ValueError("cannot specify 'out' as both "
"a positional and keyword argument")
out_args = [out]
dev_id = device.get_device_id()
in_args = _preprocess_args(dev_id, args[:self.nin], False)
out_args = _preprocess_args(dev_id, out_args, False)
in_args = _broadcast(in_args, self.in_params, False, broad_shape)
return self._call(
in_args, out_args,
broad_shape, axis, None,
keepdims, self.reduce_dims, dev_id, stream, True, True)
cdef tuple _get_expressions_and_types(
self, list in_args, list out_args, dtype):
in_ndarray_types = tuple(
[a.dtype.type if isinstance(a, _ndarray_base) else None
for a in in_args])
out_ndarray_types = tuple(
[a.dtype.type if isinstance(a, _ndarray_base) else None
for a in out_args])
in_types, out_types, type_map = _decide_params_type(
self.in_params, self.out_params,
in_ndarray_types, out_ndarray_types)
return (
self.map_expr, self.reduce_expr, self.post_map_expr,
in_types, out_types, self.reduce_type,
type_map)
cdef list _get_out_args(
self, list out_args, tuple out_types, const shape_t& out_shape):
return _get_out_args_with_params(
out_args, out_types, out_shape, self.out_params, False)
cdef function.Function _get_function(
self,
tuple params, tuple arginfos, _kernel._TypeMap type_map,
str map_expr, str reduce_expr, str post_map_expr, str reduce_type,
Py_ssize_t block_size):
in_types = []
for x in arginfos:
if x.type is cupy.ndarray:
in_types.append(cupy.dtype(x.dtype).char)
in_types = tuple(in_types)
if in_types not in self._cached_codes:
code = _ReductionKernel_get_cached_function_code(
self.nin, self.nout, params, arginfos, type_map,
self.name, block_size, reduce_type, self.identity,
map_expr, reduce_expr, post_map_expr,
self.preamble, self.options)
self._cached_codes[in_types] = code
return _ReductionKernel_get_cached_function(
self.nin, self.nout, params, arginfos, type_map,
self.name, block_size, reduce_type, self.identity,
map_expr, reduce_expr, post_map_expr,
self.preamble, self.options)
@_util.memoize()
def _ReductionKernel_get_cached_function_code(
nin, nout, params, arginfos, _kernel._TypeMap type_map,
name, block_size, reduce_type, identity, map_expr, reduce_expr,
post_map_expr, preamble, options):
cdef ParameterInfo p
cdef _ArgInfo arginfo
in_arrays = [
p for p, arginfo in zip(params[:nin], arginfos[:nin])
if not p.raw and arginfo.is_ndarray()]
out_arrays = [
p for p, arginfo in zip(params[nin:nin+nout], arginfos[nin:nin+nout])
if not p.raw and arginfo.is_ndarray()]
input_expr = '\n'.join(
[(('const {0} {1}' if p.is_const else '{0}& {1}') +
' = _raw_{1}[_in_ind.get()];').format(p.ctype, p.name)
for p in in_arrays])
output_expr = '\n'.join(
['{0} &{1} = _raw_{1}[_out_ind.get()];'.format(p.ctype, p.name)
for p in out_arrays if not p.is_const])
return _create_reduction_function_code(
name, block_size, reduce_type, params, arginfos, identity,
map_expr, reduce_expr, post_map_expr,
type_map, input_expr, output_expr, preamble, options)
@_util.memoize(for_each_device=True)
def _ReductionKernel_get_cached_function(
nin, nout, params, arginfos, _kernel._TypeMap type_map,
name, block_size, reduce_type, identity, map_expr, reduce_expr,
post_map_expr, preamble, options):
code = _ReductionKernel_get_cached_function_code(
nin, nout, params, arginfos, type_map,
name, block_size, reduce_type, identity, map_expr, reduce_expr,
post_map_expr, preamble, options)
return _create_reduction_function_from_code(name, code, options)
cdef object _bitwise_and
cdef object _bitwise_or
cdef object _bitwise_xor
cdef object _invert
cdef object _left_shift
cdef object _right_shift
from cupy._core._kernel import create_ufunc
cdef _create_bit_op(name, op, no_bool, doc='', scatter_op=None):
types = () if no_bool else ('??->?',)
return create_ufunc(
'cupy_' + name,
types + ('bb->b', 'BB->B', 'hh->h', 'HH->H', 'ii->i', 'II->I', 'll->l',
'LL->L', 'qq->q', 'QQ->Q'),
'out0 = in0 %s in1' % op,
doc=doc, scatter_op=scatter_op)
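# Example (illustrative): the helper above expands to a plain elementwise
# ufunc, e.g. _create_bit_op('bitwise_and', '&', False, ...) generates
# 'cupy_bitwise_and' with routine 'out0 = in0 & in1' over the bool and
# integer signatures listed, while no_bool=True (the shifts) drops '??->?'.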
cdef _bitwise_and = _create_bit_op(
'bitwise_and', '&', False,
'''Computes the bitwise AND of two arrays elementwise.
Only integer and boolean arrays are handled.
.. seealso:: :data:`numpy.bitwise_and`
''',
scatter_op='and')
cdef _bitwise_or = _create_bit_op(
'bitwise_or', '|', False,
'''Computes the bitwise OR of two arrays elementwise.
Only integer and boolean arrays are handled.
.. seealso:: :data:`numpy.bitwise_or`
''',
scatter_op='or')
cdef _bitwise_xor = _create_bit_op(
'bitwise_xor', '^', False,
'''Computes the bitwise XOR of two arrays elementwise.
Only integer and boolean arrays are handled.
.. seealso:: :data:`numpy.bitwise_xor`
''',
scatter_op='xor')
cdef _invert = create_ufunc(
'cupy_invert',
(('?->?', 'out0 = !in0'), 'b->b', 'B->B', 'h->h', 'H->H', 'i->i', 'I->I',
'l->l', 'L->L', 'q->q', 'Q->Q'),
'out0 = ~in0',
doc='''Computes the bitwise NOT of an array elementwise.
Only integer and boolean arrays are handled.
.. note::
:func:`cupy.bitwise_not` is an alias for :func:`cupy.invert`.
.. seealso:: :data:`numpy.invert`
''')
cdef _left_shift = _create_bit_op(
'left_shift', '<<', True,
'''Shifts the bits of each integer element to the left.
Only integer arrays are handled.
.. seealso:: :data:`numpy.left_shift`
''')
cdef _right_shift = _create_bit_op(
'right_shift', '>>', True,
'''Shifts the bits of each integer element to the right.
Only integer arrays are handled.
.. seealso:: :data:`numpy.right_shift`
''')
# Variables to expose to Python
# (cythonized data cannot be exposed to Python, even with cpdef.)
bitwise_and = _bitwise_and
bitwise_or = _bitwise_or
bitwise_xor = _bitwise_xor
invert = _invert
left_shift = _left_shift
right_shift = _right_shift
from cupy._core.core cimport _ndarray_base
cpdef _ndarray_base _ndarray_argwhere(_ndarray_base self)
cdef _ndarray_base _ndarray_getitem(_ndarray_base self, slices)
cdef _ndarray_setitem(_ndarray_base self, slices, value)
cdef tuple _ndarray_nonzero(_ndarray_base self)
cdef _scatter_op(_ndarray_base a, slices, value, op)
cdef _ndarray_base _ndarray_take(_ndarray_base self, indices, axis, out)
cdef _ndarray_base _ndarray_put(_ndarray_base self, indices, values, mode)
cdef _ndarray_base _ndarray_choose(_ndarray_base self, choices, out, mode)
cdef _ndarray_base _ndarray_compress(_ndarray_base self, condition, axis, out)
cdef _ndarray_base _ndarray_diagonal(_ndarray_base self, offset, axis1, axis2)
cdef _ndarray_base _add_reduceat(
_ndarray_base array, indices, axis, dtype, out)