Commit deb763b7 authored by root's avatar root
Browse files

Clone code from GitHub.

parent 93bf084b
Pipeline #3386 canceled with stages
import numpy
from cupy._core._dtype import get_dtype
import cupy
from cupy._core import _fusion_thread_local
from cupy._core import core
from cupy._core._scalar import get_typename
# Thread-local state shared with the fusion tracing machinery.
_thread_local = _fusion_thread_local.thread_local
# Lazily-built cache mapping an output dtype to its cast (astype) ufunc;
# ``None`` until first populated by ``_set_dtype_to_astype_dict``.
_dtype_to_astype_dict = None
def _set_dtype_to_astype_dict():
    """Populate ``_dtype_to_astype_dict`` with per-dtype cast ufuncs.

    For every supported dtype, creates a ufunc performing a
    ``static_cast`` to that dtype and stores it keyed by the output
    dtype.  Called lazily, at most once (see ``_VariableProxy.astype``).
    """
    global _dtype_to_astype_dict
    _dtype_to_astype_dict = {}
    supported = [numpy.dtype(char) for char in '?bhilqBHILQefdFD']
    for out_dtype in supported:
        cast_rules = tuple(
            '{}->{}'.format(src.char, out_dtype.char) for src in supported)
        cast_body = 'out0 = static_cast< {} >(in0)'.format(
            get_typename(out_dtype))
        _dtype_to_astype_dict[out_dtype] = core.create_ufunc(
            'astype_{}'.format(out_dtype), cast_rules, cast_body)
class _VariableProxy:
    """Abstracted array/scalar object passed to the target function.

    Wraps a traced variable and forwards Python operators and methods to
    the corresponding :mod:`cupy` routines so that each operation
    performed inside the function being fused is recorded by the tracer.
    """

    def __init__(self, content):
        assert isinstance(content, cupy._core._fusion_variable._TraceVariable)
        # The traced variable this proxy stands for.
        self.content = content

    # --- Arithmetic operators ------------------------------------------
    # Each operator delegates to the matching cupy ufunc; the reflected
    # (``__r*__``) variants swap the operand order.

    def __neg__(self):
        return cupy.negative(self)

    def __add__(self, other):
        return cupy.add(self, other)

    def __radd__(self, other):
        return cupy.add(other, self)

    def __sub__(self, other):
        return cupy.subtract(self, other)

    def __rsub__(self, other):
        return cupy.subtract(other, self)

    def __mul__(self, other):
        return cupy.multiply(self, other)

    def __rmul__(self, other):
        return cupy.multiply(other, self)

    # ``__div__``/``__rdiv__`` are Python 2 operator names — presumably
    # kept for historical compatibility; Python 3 uses ``__truediv__``.
    def __div__(self, other):
        return cupy.divide(self, other)

    def __rdiv__(self, other):
        return cupy.divide(other, self)

    def __truediv__(self, other):
        return cupy.true_divide(self, other)

    def __rtruediv__(self, other):
        return cupy.true_divide(other, self)

    def __floordiv__(self, other):
        return cupy.floor_divide(self, other)

    def __rfloordiv__(self, other):
        return cupy.floor_divide(other, self)

    def __mod__(self, other):
        return cupy.remainder(self, other)

    def __rmod__(self, other):
        return cupy.remainder(other, self)

    # NOTE(review): no ``__rpow__`` counterpart is defined here.
    def __pow__(self, other):
        return cupy.power(self, other)

    def __lshift__(self, other):
        return cupy.left_shift(self, other)

    def __rlshift__(self, other):
        return cupy.left_shift(other, self)

    def __rshift__(self, other):
        return cupy.right_shift(self, other)

    def __rrshift__(self, other):
        return cupy.right_shift(other, self)

    # --- Bitwise operators ---------------------------------------------

    def __invert__(self):
        return cupy.invert(self)

    def __and__(self, other):
        return cupy.bitwise_and(self, other)

    def __rand__(self, other):
        return cupy.bitwise_and(other, self)

    def __or__(self, other):
        return cupy.bitwise_or(self, other)

    def __ror__(self, other):
        return cupy.bitwise_or(other, self)

    def __xor__(self, other):
        return cupy.bitwise_xor(self, other)

    def __rxor__(self, other):
        return cupy.bitwise_xor(other, self)

    # --- Comparison operators ------------------------------------------
    # These return traced element-wise comparison results, not booleans.
    # NOTE: defining ``__eq__`` without ``__hash__`` makes instances
    # unhashable.

    def __lt__(self, other):
        return cupy.less(self, other)

    def __le__(self, other):
        return cupy.less_equal(self, other)

    def __eq__(self, other):
        return cupy.equal(self, other)

    def __ne__(self, other):
        return cupy.not_equal(self, other)

    def __ge__(self, other):
        return cupy.greater_equal(self, other)

    def __gt__(self, other):
        return cupy.greater(self, other)

    def copy(self):
        """Return a traced copy of this variable."""
        return cupy.copy(self)

    def astype(self, dtype, order=None, casting=None, subok=None, copy=True):
        """Cast this variable to ``dtype`` via a generated cast ufunc.

        Only ``dtype`` and ``copy`` are supported; passing ``order``,
        ``casting`` or ``subok`` raises ``TypeError``.
        """
        dtype = get_dtype(dtype)
        if order is not None:
            raise TypeError('order is not supported yet')
        if casting is not None:
            raise TypeError('casting is not supported yet')
        if subok is not None:
            raise TypeError('subok is not supported yet')
        if not copy and self.dtype == dtype:
            # Same dtype and copy not requested: reuse this proxy.
            return self
        if _dtype_to_astype_dict is None:
            # Build the dtype -> cast-ufunc table lazily, once.
            _set_dtype_to_astype_dict()
        return _dtype_to_astype_dict[dtype](self)

    # --- Reductions: delegate to the cupy reduction routines -----------

    def sum(self, axis=None, dtype=None, out=None, keepdims=False):
        return cupy.sum(
            self, axis=axis, dtype=dtype, out=out, keepdims=keepdims)

    def prod(self, axis=None, dtype=None, out=None, keepdims=False):
        return cupy.prod(
            self, axis=axis, dtype=dtype, out=out, keepdims=keepdims)

    def max(self, axis=None, out=None, keepdims=False):
        return cupy.max(self, axis=axis, out=out, keepdims=keepdims)

    def min(self, axis=None, out=None, keepdims=False):
        return cupy.min(self, axis=axis, out=out, keepdims=keepdims)

    def all(self, axis=None, out=None, keepdims=False):
        return cupy.all(self, axis=axis, out=out, keepdims=keepdims)

    def any(self, axis=None, out=None, keepdims=False):
        return cupy.any(self, axis=axis, out=out, keepdims=keepdims)

    @property
    def dtype(self):
        # Dtype of the underlying traced variable.
        return self.content.dtype

    @property
    def ndim(self):
        # Number of dimensions of the underlying traced variable.
        return self.content.ndim

    @property
    def shape(self):
        raise NotImplementedError('`shape` is not supported, currently.')
class _ScalarProxy(_VariableProxy):
    """An abstracted scalar object passed to the target function.

    Attributes:
        dtype(dtype): The dtype of the array.
        imag(_ArrayProxy): The imaginary part of the array (Not implemented)
        real(_ArrayProxy): The real part of the array (Not implemented)
        ndim(int): The number of dimensions of the array.
    """

    def __repr__(self):
        # NOTE(review): ``_emit_param_name`` is not defined anywhere in
        # this file; presumably provided elsewhere or this repr path is
        # unused — verify before relying on it.
        return '_ScalarProxy({}, dtype={})'.format(
            self._emit_param_name(), self.dtype)
class _ArrayProxy(_VariableProxy):
    """An abstracted array object passed to the target function.

    Attributes:
        dtype(dtype): The dtype of the array.
        imag(_ArrayProxy): The imaginary part of the array (Not implemented)
        real(_ArrayProxy): The real part of the array (Not implemented)
        ndim(int): The number of dimensions of the array.
    """

    def __repr__(self):
        return '_ArrayProxy([...], dtype=\'{}\', ndim={})'.format(
            self.dtype.char, self.ndim)

    def _inplace_op(self, ufunc, other):
        # In-place update: pass ``self`` as the explicit output argument
        # of the ufunc.
        return ufunc(self, other, self)

    # Augmented-assignment operators all route through ``_inplace_op``.

    def __iadd__(self, other):
        return self._inplace_op(cupy.add, other)

    def __isub__(self, other):
        return self._inplace_op(cupy.subtract, other)

    def __imul__(self, other):
        return self._inplace_op(cupy.multiply, other)

    # Python 2 name, see _VariableProxy.__div__.
    def __idiv__(self, other):
        return self._inplace_op(cupy.divide, other)

    def __itruediv__(self, other):
        return self._inplace_op(cupy.true_divide, other)

    def __ifloordiv__(self, other):
        return self._inplace_op(cupy.floor_divide, other)

    def __imod__(self, other):
        return self._inplace_op(cupy.remainder, other)

    def __ipow__(self, other):
        return self._inplace_op(cupy.power, other)

    def __ilshift__(self, other):
        return self._inplace_op(cupy.left_shift, other)

    def __irshift__(self, other):
        return self._inplace_op(cupy.right_shift, other)

    def __iand__(self, other):
        return self._inplace_op(cupy.bitwise_and, other)

    def __ior__(self, other):
        return self._inplace_op(cupy.bitwise_or, other)

    def __ixor__(self, other):
        return self._inplace_op(cupy.bitwise_xor, other)

    def __getitem__(self, index):
        # Indexing is traced through the fusion history.
        return _fusion_thread_local.call_indexing(self, index)

    def __setitem__(self, slices, value):
        # Only whole-array assignment (``a[...] = v`` or ``a[:] = v``)
        # is supported; it is traced as an elementwise copy into self.
        if slices is Ellipsis or (
                isinstance(slices, slice) and slices == slice(None)):
            _fusion_thread_local.call_ufunc(
                core.elementwise_copy, value, out=self)
        else:
            raise ValueError('The fusion supports `[...]` or `[:]`.')
import itertools
import string
from libcpp cimport vector
from cupy._core cimport _carray
from cupy._core.core cimport _ndarray_init
from cupy._core.core cimport compile_with_cache
from cupy._core.core cimport _ndarray_base
from cupy._core cimport internal
from cupy._core cimport _routines_manipulation as _manipulation
from cupy_backends.cuda.api cimport driver
from cupy_backends.cuda.api cimport runtime
import cupy as _cupy
from cupy._core import _dtype
from cupy import _util
from cupy._core import _codeblock
from cupy._core import _fusion_op
from cupy._core._fusion_variable import _TraceVariable
from cupy._core._fusion_variable import _TraceScalar
from cupy._core._fusion_variable import _TraceArray
# Default CUDA block size for fused kernels; HIP gets a smaller block.
cdef Py_ssize_t _default_block_size = (
    256 if runtime._is_hip_environment else 512)
@_util.memoize(for_each_device=True)
def _cuda_compile(preamble, name, cuda_params, cuda_body, use_grid_sync):
    """Compile the fused CUDA source and return the kernel function.

    Assembles a single ``extern "C" __global__`` function from the given
    pieces, prepending the cooperative-groups header when grid-wide
    synchronization is required.  Results are memoized per device.
    """
    pieces = []
    if use_grid_sync:
        pieces.append('#include <cooperative_groups.h>\n\n')
    pieces.append(preamble)
    pieces.append('\n\nextern "C" __global__ void ')
    pieces.append(name)
    pieces.append('(')
    pieces.append(cuda_params)
    pieces.append(') ')
    pieces.append(cuda_body)
    pieces.append('\n')
    code = ''.join(pieces)
    # (For contributors) Uncomment the following line to inspect the
    # whole generated CUDA code.
    # print(code)
    module = compile_with_cache(
        code, (), None, None, True, 'nvrtc', False, use_grid_sync)
    return module.get_function(name)
cdef class FusedKernel:
    """Compiled representation of a fused kernel built from a trace.

    Holds the generated CUDA source pieces plus per-parameter metadata
    (dtype, base/view relationship, input position) needed to allocate
    arrays and launch the kernel at call time.
    """
    cdef:
        readonly object shape_constraints
        readonly str _name
        readonly list _params
        # Return convention: -1 -> None, -2 -> single array,
        # >= 0 -> tuple of that many arrays (see _get_return_value).
        readonly int _return_size
        readonly str _submodule_code
        readonly str _cuda_body
        readonly dict _cuda_params_memo
        readonly list _block_strides
        readonly bint _use_grid_sync
        readonly list _reduction_in_array
        readonly list _reduction_out_array
        readonly vector.vector[bint] _is_base
        readonly list _dtypes
        readonly vector.vector[Py_ssize_t] _input_index
        readonly vector.vector[Py_ssize_t] _view_of
        readonly vector.vector[Py_ssize_t] _out_params

    def __init__(self, name, trace_result):
        op_list = trace_result.op_list
        params = trace_result.params
        return_size = trace_result.return_size
        self.shape_constraints = trace_result.shape_constraints
        self._name = name
        self._params = sorted(params, key=lambda x: x.serial_number)
        self._cuda_params_memo = {}

        # Generate the device functions.
        submodule_code = '\n\n'.join(set(itertools.chain.from_iterable([
            op.emit_preamble_codes() for op in op_list]))) + '\n\n'
        submodule_code += '\n\n'.join(itertools.chain.from_iterable([
            op.emit_submodule_codes() for op in op_list]))

        # Generate the function body of a __global__ function.
        codes = []
        # Grid-wide sync (cooperative groups) is only needed when more
        # than one op runs inside the same kernel.
        self._use_grid_sync = len(op_list) > 1
        if self._use_grid_sync:
            codes.append('namespace _cg = cooperative_groups;')
            codes.append('_cg::grid_group _grid = _cg::this_grid();')
        for i, op in enumerate(op_list):
            if i > 0:
                codes.append('_cg::sync(_grid);')
            codes.append(op.emit_code())
        self._submodule_code = submodule_code
        self._cuda_body = str(_codeblock.CodeBlock('', codes))

        # Check the format of the return value and encode it into
        # ``_return_size`` (see the attribute comment above).
        if return_size == 'none':
            self._return_size = -1
            self._out_params.resize(0)
        elif return_size == 'single':
            self._return_size = -2
            self._out_params.resize(1)
        else:
            assert isinstance(return_size, int)
            assert return_size >= 0
            self._return_size = return_size
            self._out_params.resize(return_size)

        for p in self._params:
            assert isinstance(p, _TraceVariable)

        # Analyse the relationship between variables.
        array_dict = {}
        self._reduction_in_array = []
        self._reduction_out_array = []
        self._dtypes = []
        for i, p in enumerate(self._params):
            view_of = -1
            input_index = -1
            if p.input_index is not None:
                input_index = p.input_index
            if isinstance(p, _TraceArray):
                if p._view_of is not None:
                    # Record the index of the array this one views.
                    view_of = array_dict[p._view_of.key()]
                if p.is_output:
                    self._out_params[p.output_index] = i
                array_dict[p.key()] = i
            self._is_base.push_back(p.is_base)
            self._dtypes.append(_dtype.get_dtype(p.dtype))
            self._input_index.push_back(input_index)
            self._view_of.push_back(view_of)

        # One block-stride kernel parameter per reduction op.
        self._block_strides = []
        for op in op_list:
            if isinstance(op, _fusion_op._ReductionTraceOp):
                self._reduction_in_array.append(
                    array_dict[op.in_params.item().key()])
                self._reduction_out_array.append(
                    array_dict[op.out_params.item().key()])
                self._block_strides.append(
                    'int {}'.format(op.block_stride_name))

    def get_shapes_of_kernel_params(self, tuple args):
        """Returns the shapes of parameters passed to kern.linear_launch.
        """
        cdef list kernel_param_shapes = []
        cdef int axis
        cdef list shape
        for param in self._params:
            shape = []
            if isinstance(param, _TraceArray):
                ashape = param.ashape
                for axis in range(len(ashape)):
                    dim = ashape[axis]
                    if not isinstance(dim, int):
                        # Abstract dimension: resolve its concrete length
                        # from the corresponding input argument.
                        dim = args[dim.input_index].shape[dim.axis]
                    shape.append(dim)
            kernel_param_shapes.append(tuple(shape))
        return kernel_param_shapes

    cdef list _get_ndarray_list(self, tuple args, list shapes):
        """Get the list of ndarray corresponding to ``self._params``.
        """
        cdef list ndarray_list = []
        cdef list params = self._params
        cdef int i
        for i in range(len(params)):
            param = params[i]
            shape = shapes[i]
            if self._input_index[i] >= 0:
                # Input parameter: reuse the caller-supplied object.
                array = args[<Py_ssize_t>self._input_index[i]]
            elif isinstance(param, _TraceScalar):
                # Scalars occupy a slot but carry no array.
                array = None
            elif self._is_base[i]:
                # Base array: allocate fresh storage.
                array = _ndarray_init(
                    _cupy.ndarray, shape, self._dtypes[i], None)
            else:
                # View: derive from the previously-created base array.
                view_of = ndarray_list[<Py_ssize_t>self._view_of[i]]
                if param.is_broadcast:
                    array = _manipulation.broadcast_to(view_of, shape)
                elif param.slice_key is not None:
                    array = view_of[param.slice_key]
                elif param.rotate_axis is not None:
                    axis_permutes = list(param.rotate_axis)
                    # NOTE(review): this inner loop reuses the typed
                    # index ``i`` of the enclosing loop; confirm it does
                    # not disturb the outer iteration under Cython's
                    # C-loop optimization of ``for ... in range``.
                    for i in range(param.ndim):
                        if i not in param.rotate_axis:
                            axis_permutes.append(i)
                    axis_permutes = tuple(axis_permutes)
                    array = _manipulation._transpose(view_of, axis_permutes)
                else:
                    assert False
            # For debug
            # if isinstance(array, ndarray) and param.rotate_axis is None:
            #     assert array.shape == shape, (array.shape, shape)
            ndarray_list.append(array)
        return ndarray_list

    cdef object _get_return_value(self, list ndarray_list):
        """Get the return value of ``self.execute``.
        """
        cdef int i
        if self._return_size == -1:
            # The traced function returns nothing.
            return None
        if self._return_size == -2:
            # Single-array return.
            return ndarray_list[<Py_ssize_t>self._out_params[0]]
        # Tuple of ``_return_size`` arrays.
        return tuple([
            ndarray_list[<Py_ssize_t>self._out_params[i]]
            for i in range(self._return_size)
        ])

    cdef tuple _get_kernel_size(self, list ndarray_list):
        """Calculate launch geometry for the reduction ops.

        Returns ``(block_strides, block_size, shared_mem)``:
        ``block_strides`` has one entry per reduction op (the number of
        contiguous blocks in non-reduction axes of its input array).
        """
        cdef _ndarray_base in_array, out_array
        cdef Py_ssize_t block_size, block_stride, contiguous_size
        cdef list block_strides = []
        if len(self._reduction_in_array) == 0:
            # No reductions: fixed block size, no dynamic shared memory.
            return [], 256, 0
        block_size = _default_block_size
        for i in range(len(self._reduction_in_array)):
            in_array = ndarray_list[self._reduction_in_array[i]]
            out_array = ndarray_list[self._reduction_out_array[i]]
            # TODO(asi1024): Fix block strides for performance.
            # Count trailing contiguous elements of the input (capped
            # at 32 below).
            # NOTE(review): the inner loop below reuses ``i`` from the
            # outer loop — confirm this is intended.
            contiguous_size = 1
            itemsize = in_array.dtype.itemsize
            for i in range(out_array.ndim):
                if in_array.strides[-i-1] != contiguous_size * itemsize:
                    break
                contiguous_size *= in_array.shape[-i-1]
            contiguous_size = min(contiguous_size, 32)
            reduce_block_size = max(1, in_array.size // max(1, out_array.size))
            block_stride = max(
                contiguous_size, block_size // reduce_block_size)
            block_stride = internal.clp2(block_stride // 2 + 1)  # floor
            block_strides.append(block_stride)
        shared_mem = block_size * 32  # max bytesize of reduce_ctype.
        return block_strides, block_size, shared_mem

    cdef tuple _reduce_dims(self, list ndarray_list):
        """Reduce number of dimensions of ndarrays and returns the cache key.
        """
        cdef list params = self._params
        cdef list ndims = []
        cdef _ndarray_base array
        cdef int i
        for i in range(len(params)):
            param = params[i]
            if param.ndim <= 1:
                continue
            array = ndarray_list[i]
            array = array.reduced_view()
            ndarray_list[i] = array
            ndims.append(array.ndim)
        # The post-reduction ndims act as the CUDA-params cache key.
        return tuple(ndims)

    cdef list _get_inout_args(self, tuple args, list ndarray_list):
        """Get the arguments passed to ``kern.linear_launch``.
        """
        cdef list params = []
        cdef list indexers = []
        cdef _carray.Indexer indexer
        for i in range(len(self._params)):
            array = ndarray_list[i]
            if isinstance(array, _ndarray_base):
                indexer = _carray.Indexer.__new__(_carray.Indexer)
                indexer.init(array._shape)
                indexers.append(indexer)
                params.append(array)
            elif self._input_index[i] >= 0:
                # Non-array input (scalar) passed straight through.
                obj = args[<Py_ssize_t>self._input_index[i]]
                params.append(obj)
        # Arrays/scalars first, then indexers (must match the order
        # emitted by _get_cuda_params).
        return params + indexers

    cdef str _get_cuda_params(self, tuple key, list ndarray_list):
        """Get a string of parameters of CUDA main function code.
        """
        cdef int i
        if key in self._cuda_params_memo:
            return self._cuda_params_memo[key]
        cuda_params = []
        indexers = []
        for i in range(len(self._params)):
            a = self._params[i]
            if isinstance(a, _TraceArray):
                array = ndarray_list[i]
                ndim = array.ndim
                c_contiguous = 'true' if array._c_contiguous else 'false'
                index_32_bits = 'true' if array._index_32_bits else 'false'
                cuda_params.append(a.format(
                    'CArray<${type}, ${ndim}, ${cont}, ${ind32}> ${var}',
                    ndim=ndim, cont=c_contiguous, ind32=index_32_bits))
                indexers.append(
                    a.format('CIndexer<${ndim}> ${indexer}', ndim=ndim))
            elif isinstance(a, _TraceScalar):
                if a.const_value is None:
                    cuda_params.append(a.format('${type} ${var}'))
                # Scalars with a known constant value emit no kernel
                # parameter — presumably inlined into the generated
                # code; TODO confirm.
            else:
                raise TypeError('Unknown type {}.'.format(type(a)))
        ret = cuda_params + indexers + self._block_strides
        ret = ', '.join(ret)
        self._cuda_params_memo[key] = ret
        return ret

    def execute(self, tuple args, list shapes):
        """Allocate arrays, compile (memoized) and launch the kernel."""
        ndarray_list = self._get_ndarray_list(args, shapes)
        ret = self._get_return_value(ndarray_list)
        reduce_key = self._reduce_dims(ndarray_list)
        inout_args = self._get_inout_args(args, ndarray_list)
        cuda_params = self._get_cuda_params(reduce_key, ndarray_list)
        kern = _cuda_compile(
            self._submodule_code, self._name, cuda_params, self._cuda_body,
            self._use_grid_sync)

        block_strides, block_size, shared_mem = (
            self._get_kernel_size(ndarray_list))
        # TODO(asi1024): Optimize kernel size parameter.
        if not runtime._is_hip_environment:
            # Size the grid from the CUDA occupancy calculator.
            kern_size = driver.occupancyMaxActiveBlocksPerMultiprocessor(
                kern.ptr, block_size, shared_mem) * block_size
        else:
            # In HIP sometimes the occupancy calc seems to be broken
            kern_size = block_size * 512
        kargs = inout_args + block_strides
        kern.linear_launch(
            kern_size, kargs, shared_mem, block_size,
            enable_cooperative_groups=self._use_grid_sync)
        return ret
import string
import numpy
from cupy._core import _codeblock
from cupy._core._fusion_variable import _TraceVariable
from cupy._core._fusion_variable import _TraceArray
from cupy._core._fusion_variable import _VariableSet
from cupy._core import _fusion_thread_local
from cupy._core import _kernel
from cupy._core import _reduction
from cupy._core._scalar import get_typename
class _UfuncRoutine:
"""A device function for single elementwise operations.
"""
def __init__(
self, name, ufunc, routine_code, in_params, out_params,
compute_dtypes):
assert isinstance(name, str)
assert isinstance(ufunc, _kernel.ufunc)
assert isinstance(routine_code, str)
assert isinstance(compute_dtypes, tuple)
assert all(isinstance(t, numpy.dtype) for t in compute_dtypes)
assert isinstance(in_params, list)
assert all(isinstance(p, _TraceVariable) for p in in_params)
assert isinstance(out_params, list)
assert all(isinstance(p, _TraceArray) for p in out_params)
self.name = name
self.in_params = in_params
self.out_params = out_params
self.preamble = ufunc._preamble
self.routine_code = routine_code
self.compute_dtypes = compute_dtypes
def emit_code(self):
"""Returns a CUDA device function code.
Returns a string like:
```
__device__ void cupy_add_0(int &in0_, float &in1_, double &out0_) {
typedef double in0_type;
typedef double in1_type;
typedef double out0_type;
double in0 = (double) in0_;
double in1 = (double) in1_;
double out0 = (double) out0_;
out0 = in0 + in1;
out0_ = out0;
}
```
"""
nin = len(self.in_params)
dtypes = self.compute_dtypes
assert len(self.in_params) == len(self.compute_dtypes[:nin])
in_params = [
(get_typename(p.dtype), get_typename(t), 'in{}'.format(i))
for i, (p, t) in enumerate(zip(self.in_params, dtypes[:nin]))
]
out_params = [
(get_typename(p.dtype), get_typename(t), 'out{}'.format(i))
for i, (p, t) in enumerate(zip(self.out_params, dtypes[nin:]))
]
params = in_params + out_params
params_code = ', '.join(['{} &{}_'.format(t, s) for t, _, s in params])
typedef = ['typedef {} {}_type;'.format(t, s) for _, t, s in params]
read = ['{} {} = ({}) {}_;'.format(t, s, t, s) for _, t, s in params]
write = ['{}_ = {};'.format(s, s) for _, _, s in out_params]
return _codeblock.CodeBlock(
'__device__ void {}({})'.format(self.name, params_code),
typedef + read + [self.routine_code + ';'] + write)
def emit_call_code(self):
params = self.in_params + self.out_params
return '{op_name}({params});'.format(
op_name=self.name,
params=', '.join([var.lvar_name for var in params]))
class _ElementwiseTraceOp:
    """Ufunc or elementwise kernel with types.

    Groups one or more ``_UfuncRoutine`` device functions into a single
    elementwise loop over ``ashape``.
    """

    def __init__(self, ufunc_routines, in_params, out_params, ashape):
        # The `in_params` and `out_params` should be already broadcasted to
        # `ashape`, but they don't guarantee to be exactly same as
        # `param.ashape`.
        _fusion_thread_local.check_not_runtime()
        assert isinstance(ufunc_routines, list)
        assert all(isinstance(r, _UfuncRoutine) for r in ufunc_routines)
        assert isinstance(ashape, tuple)
        self.ops = ufunc_routines
        self.in_params = _VariableSet(*in_params)
        self.out_params = _VariableSet(*out_params)
        self.ashape = ashape

    @property
    def params(self):
        """Returns the set of all variables the loop uses.
        """
        res = _VariableSet()
        for op in self.ops:
            res += _VariableSet(*op.in_params)
            res += _VariableSet(*op.out_params)
        return res

    @staticmethod
    def _emit_declaration(params, in_params):
        """Returns a tuple of size 2.

        1. CUDA code: declaring local variables.
        2. The set of arrays which require indexer.
        """
        _fusion_thread_local.check_not_runtime()
        indexed_arrays = _VariableSet()
        code = []
        for var in params:
            if var in in_params:
                if isinstance(var, _TraceArray):
                    # Input array: load through its indexer.
                    indexed_arrays.add(var)
                    f = '${type} ${lvar} = ${var}[${indexer}.get()];'
                else:
                    # Input scalar: copy the value directly.
                    f = '${type} ${lvar} = ${var};'
            else:
                # Intermediate/output variable: declare uninitialized.
                f = '${type} ${lvar};'
            code.append(var.format(f))
        return code, indexed_arrays

    @staticmethod
    def _emit_after_operation(out_params):
        """Returns a tuple of size 2.

        1. CUDA code: writing the results of operations back to global memory.
        2. The set of arrays which require indexer.
        """
        _fusion_thread_local.check_not_runtime()
        indexed_arrays = _VariableSet()
        codes = []
        for var in out_params:
            if isinstance(var, _TraceArray):
                indexed_arrays.add(var)
                f = '${var}[${indexer}.get()] = ${lvar};'
            else:
                f = '${var} = ${lvar};'
            codes.append(var.format(f))
        return codes, indexed_arrays

    @staticmethod
    def _emit_set_index(indexed_params, tid):
        """Returns a CUDA code: setting a raw index to indexers.
        """
        _fusion_thread_local.check_not_runtime()
        assert isinstance(indexed_params, _VariableSet)
        return [
            p.format('${indexer}.set(${tid});', tid=tid)
            for p in indexed_params
        ]

    def emit_code(self):
        """Emit the CUPY_FOR loop running all routines elementwise."""
        _fusion_thread_local.check_not_runtime()
        declaration, s1 = self._emit_declaration(self.params, self.in_params)
        operation = [op.emit_call_code() for op in self.ops]
        after_operation, s2 = self._emit_after_operation(self.out_params)
        index_name = 'i'
        indexed_array = s1 + s2
        # NOTE(review): assumes at least one indexed array exists; an
        # empty set would make ``next(iter(...))`` raise StopIteration.
        indexer_name = next(iter(indexed_array)).indexer_name
        indexer_setup = self._emit_set_index(indexed_array, index_name)
        return _codeblock.CodeBlock(
            'CUPY_FOR({}, {}.size())'.format(index_name, indexer_name),
            indexer_setup + declaration + operation + after_operation)

    def emit_preamble_codes(self):
        # Preambles of the contained routines; empty ones are skipped.
        return [subm.preamble for subm in self.ops if subm.preamble != '']

    def emit_submodule_codes(self):
        # One __device__ function per contained routine.
        return [str(subm.emit_code()) for subm in self.ops]
class _ReductionTraceOp:
    def __init__(self, name, reduce_func, expr, in_param, out_param, axis):
        """Reduction operation.

        Args:
            name (str): Name of the generated device function.
            reduce_func (_SimpleReductionKernel): Supplies the identity,
                preamble and expression pieces.
            expr: 4-tuple whose last three items are the reduce
                expression, the postmap cast code and the reduce ctype
                (first item unused here).
            in_param (_TraceArray): The input array variable.
            out_param (_TraceArray): The output array variable.
            axis (tuple of int): Axes being reduced over.
        """
        _fusion_thread_local.check_not_runtime()
        assert isinstance(name, str)
        assert isinstance(reduce_func, _reduction._SimpleReductionKernel)
        assert isinstance(in_param, _TraceArray)
        assert isinstance(out_param, _TraceArray)
        assert isinstance(axis, tuple)
        assert all(0 <= x < in_param.ndim for x in axis)

        self.name = name
        self.preamble = reduce_func.preamble
        self.in_params = _VariableSet(in_param)
        self.out_params = _VariableSet(out_param)
        self.block_stride_name = 'block_stride_' + name
        self.axis = axis

        # ``identity`` initializes the reduction accumulator; an empty
        # string means the kernel has no identity element.
        if reduce_func.identity is None:
            self.identity = ''
        else:
            self.identity = str(reduce_func.identity)

        _, self.expr, self.postmap_cast_code, self.reduce_ctype = expr
        if self.reduce_ctype is None:
            # Default the accumulator ctype to the output dtype.
            out_param, = self.out_params
            self.reduce_ctype = get_typename(out_param.dtype)

        self.premap_op = None
        self.postmap_op = None

    @property
    def params(self):
        # All variables this op reads or writes.
        return self.in_params + self.out_params

    def emit_code(self):
        """Emit the call statement invoking the reduction function."""
        _fusion_thread_local.check_not_runtime()
        assert len(self.in_params) == 1
        assert len(self.out_params) == 1
        in_param = list(self.in_params)[0]
        out_param = list(self.out_params)[0]
        params = ', '.join([
            in_param.var_name,
            out_param.var_name,
            in_param.indexer_name,
            out_param.indexer_name,
        ])
        return '{}({}, {});'.format(
            self.name, params, self.block_stride_name)

    def emit_preamble_codes(self):
        # The reduction kernel's preamble, if it has one.
        preamble = self.preamble
        return [preamble] if preamble != '' else []

    def emit_submodule_codes(self):
        """Returns a CUDA device function code.

        The emitted code assumes that ``block_stride`` and `blockDim.x` is a
        power of 2.
        """
        in_param, = self.in_params
        out_param, = self.out_params
        op_name = '{}_op'.format(self.name)
        postmap_name = '{}_postmap'.format(self.name)
        template = string.Template('''
#define ${op_name}(a, b) (${reduce_expr})
#define ${postmap_name}(a, out0) (${postmap_cast})
template <typename InType, typename OutType, typename InIndexerType, typename OutIndexerType>
__device__ void ${name}(
InType in_arr, OutType out_arr,
InIndexerType in_ind, OutIndexerType out_ind, int block_stride) {
typedef ${in_type} type_in0_raw;
typedef ${out_type} type_out0_raw;
typedef ${reduce_ctype} _type_reduce;
extern __shared__ char _sdata_raw[];
_type_reduce *sdata = reinterpret_cast<_type_reduce*>(_sdata_raw);
unsigned int tid = threadIdx.x;
int _J = tid >> __popc(block_stride - 1);
ptrdiff_t _j = (ptrdiff_t)_J * out_ind.size();
int J_stride = blockDim.x >> __popc(block_stride - 1);
ptrdiff_t j_stride = (ptrdiff_t)J_stride * out_ind.size();
for (ptrdiff_t _i = (ptrdiff_t)blockIdx.x * block_stride; _i < out_ind.size(); _i += (ptrdiff_t)gridDim.x * block_stride) {
_type_reduce s = _type_reduce(${identity});
ptrdiff_t i = _i + (tid & (block_stride - 1));
for (ptrdiff_t j = i + _j; j < in_ind.size(); j += j_stride) {
in_ind.set(j);
s = ${op_name}(s, static_cast<_type_reduce>(in_arr[in_ind.get()]));
}
sdata[tid] = s;
__syncthreads();
for (unsigned int block = blockDim.x / 2; block >= block_stride; block >>= 1) {
if (tid < block) {
sdata[tid] = ${op_name}(sdata[tid], sdata[tid + block]);
}
__syncthreads();
}
if (tid < block_stride) {
s = sdata[tid];
}
if (tid < block_stride && i < out_ind.size()) {
out_ind.set(i);
${postmap_name}(s, out_arr[out_ind.get()]);
}
__syncthreads();
}
}''')  # NOQA
        code = template.substitute(
            name=self.name,
            op_name=op_name,
            postmap_name=postmap_name,
            in_type=get_typename(in_param.dtype),
            out_type=get_typename(out_param.dtype),
            reduce_ctype=self.reduce_ctype,
            reduce_expr=self.expr,
            identity=self.identity,
            postmap_cast=self.postmap_cast_code
        )
        return [code]
from cupy._core import _fusion_variable
from cupy._core import _fusion_op
def _reduce_memory_access(ops):
required_memories = set()
for op in ops:
for p in op.in_params + op.out_params:
if p.memory.is_inout:
required_memories.add(p.memory)
for op in ops[::-1]:
in_memories = set([p.memory for p in op.in_params])
new_out_params = []
for p in op.out_params:
if p.memory in required_memories:
new_out_params.append(p)
op.out_params = _fusion_variable._VariableSet(*new_out_params)
# TODO(asi1024): The following improvement can be applicable only
# when the memory space is used at most once.
# `required_memories -= out_memories`
required_memories |= in_memories
return [op for op in ops if len(op.out_params) > 0]
def _normalize_ashapes(ops, variables, shape_constraints):
def normalize(shape):
return tuple([shape_constraints.evaluate(d) for d in shape])
for var in variables:
var.ashape = normalize(var.ashape)
for op in ops:
if isinstance(op, _fusion_op._ElementwiseTraceOp):
op.ashape = normalize(op.ashape)
def _fuse_two_ops(op1, op2):
    """Returns a fused Op if the two ops can be fused, and ``None`` otherwise.

    Fusion mutates ``op1`` in place, appending ``op2``'s routines and
    merging its parameter sets.
    """
    # Only elementwise-elementwise pairs are fusable for now.
    # TODO(asi1024): Support reduction postmap.
    if not isinstance(op1, _fusion_op._ElementwiseTraceOp):
        return None
    # TODO(asi1024): Support reduction premap.
    if not isinstance(op2, _fusion_op._ElementwiseTraceOp):
        return None
    if op1.ashape != op2.ashape:
        return None

    # op2 inputs produced by op1 become internal; the rest are inputs.
    merged_in = op1.in_params + (op2.in_params - op1.out_params)
    merged_out = op1.out_params + op2.out_params
    # Refuse to fuse when a distinct input and output may alias the
    # same memory space.
    if any(src.memory == dst.memory and src != dst
           for src in merged_in for dst in merged_out):
        return None

    op1.ops.extend(op2.ops)
    op1.in_params = merged_in
    op1.out_params = merged_out
    return op1
def _fuse_consecutive_ops(ops, shape_constraints):
res = []
for op in ops:
if len(res) == 0:
res.append(op)
else:
prev_op = res.pop(-1)
new_op = _fuse_two_ops(prev_op, op)
if new_op is None:
res.extend([prev_op, op])
else:
res.append(new_op)
return res
def optimize(ops, variables, shape_constraints):
    """Run the trace-level optimization pipeline and return the new ops.

    Normalizes abstract shapes, prunes dead memory writes, fuses
    consecutive elementwise ops, then prunes again (fusion can make
    further writes dead).
    """
    _normalize_ashapes(ops, variables, shape_constraints)
    pruned = _reduce_memory_access(ops)
    fused = _fuse_consecutive_ops(pruned, shape_constraints)
    return _reduce_memory_access(fused)
import threading
thread_local = threading.local()
cpdef inline bint is_old_fusing() except? -1:
    """Return whether old-style fusion tracing is active on this thread."""
    try:
        return thread_local.is_old_fusing
    except AttributeError:
        # Lazily initialize the per-thread flag on first access.
        thread_local.is_old_fusing = False
        return False
cpdef inline bint is_new_fusing() except? -1:
    """Return whether new-style fusion tracing is active on this thread."""
    try:
        return thread_local.is_new_fusing
    except AttributeError:
        # Lazily initialize the per-thread flag on first access.
        thread_local.is_new_fusing = False
        return False
cpdef inline bint is_fusing() except? -1:
    """Return whether any (old- or new-style) fusion tracing is active."""
    return is_old_fusing() or is_new_fusing()
def check_not_runtime():
    """Assert that new-style fusion tracing is active (trace time, not
    kernel runtime)."""
    assert is_new_fusing()
def call_ufunc(fusion_op, *args, **kwargs):
    """Dispatch a ufunc call to the active fusion implementation."""
    if is_new_fusing():
        # New-style tracing: record into the per-thread trace history.
        return thread_local.history.call_ufunc(fusion_op, *args, **kwargs)
    # Old-style path; imported lazily — presumably to avoid a circular
    # import at module load time (confirm).
    import cupy
    return cupy._core.fusion._call_ufunc(fusion_op, *args, **kwargs)
def call_reduction(fusion_op, *args, **kwargs):
    """Dispatch a reduction call to the active fusion implementation."""
    if is_new_fusing():
        # New-style tracing: record into the per-thread trace history.
        return thread_local.history.call_reduction(fusion_op, *args, **kwargs)
    # Old-style path; imported lazily — presumably to avoid a circular
    # import at module load time (confirm).
    import cupy
    return cupy._core.fusion._call_reduction(fusion_op, *args, **kwargs)
def call_indexing(fusion_op, *args, **kwargs):
    """Record an indexing operation in the active (new-style) trace
    history; no old-style fallback exists for indexing."""
    return thread_local.history.call_indexing(fusion_op, *args, **kwargs)
This diff is collapsed. (GitLab viewer artifact — the contents of this file section are omitted from this view.)
cdef class _AbstractDim:
    # Declaration stub (.pxd): an abstract dimension is identified by
    # the position of the source argument and the axis within it.
    cdef:
        readonly int input_index
        readonly int axis
import string
import numpy
from cupy._core import _fusion_interface
from cupy._core._scalar cimport get_typename
cdef class _AbstractDim:
    """An abstracted data structure for a length of dimensions.

    Attributes:
        input_index (int):
            The position of the element in the arguments passed to the
            fused function
        axis (int):
            The index of dimensions
    """

    def __init__(self, int input_index, int axis):
        self.input_index = input_index
        self.axis = axis

    def __hash__(self):
        # Hash on the identifying pair, consistent with __eq__ below.
        return hash((self.input_index, self.axis))

    def __eq__(self, object other):
        # A concrete (int) dimension never equals an abstract one.
        if isinstance(other, int):
            return False
        # NOTE(review): assumes ``other`` is another _AbstractDim past
        # this point; any other type raises AttributeError — confirm
        # callers only compare against ints and _AbstractDims.
        return (
            self.input_index == other.input_index
            and self.axis == other.axis
        )
class _MemorySpace:
"""A memory space object.
Attributes:
id(int): The serial number of memory space.
base_serial_number(int): The serial number of the base variable
which have this memory space.
is_input(bool): If this is set to ``True``, the memory space is
already allocated as an input array. If this is set to ``False``,
the memory space should be allocated before launching the kernel.
is_output(bool): If this is set to ``True``, the memory space is
used in the return values.
"""
def __init__(self, memory_id, base_serial_number):
assert isinstance(memory_id, int)
assert isinstance(base_serial_number, int)
self.id = memory_id
self.base_serial_number = base_serial_number
# Initially, these attributes are set to be `False`, but might be
# updated from outside.
self.is_input = False
self.is_output = False
@property
def is_inout(self):
"""Returns ``True`` if the memory space is used for inputs or outputs.
If ``True``, the memory space should not be deallocated just after
the kernel launch. If ``False``, the memory space is used only for
temporary value in the fused kernel."""
return self.is_input or self.is_output
class _TraceVariable:
    """Variable object to trace operations in the target function to be fused.

    Attributes:
        memory (_MemorySpace): The memory space the variable uses.
        serial_number (int): The serial number of the variable object.
        dtype (numpy.dtype): The dtype of the variable.
        rshape (tuple of int): The real shape of the variable.
        ashape (tuple of int or _AbstractDim): An abstracted shape of the
            variable.
        input_index (int or None): If not ``None``, this variable is used as
            the ``input_index``-th input parameter.
        output_index (int or None): If not ``None``, this variable is used as
            the ``output_index``-th output parameter.
    """

    def __init__(
            self, memory_space, serial_number, dtype, rshape, ashape,
            input_index, output_index):
        assert isinstance(memory_space, _MemorySpace)
        assert isinstance(serial_number, int)
        assert isinstance(dtype, numpy.dtype)
        assert input_index is None or isinstance(input_index, int)
        assert output_index is None or isinstance(output_index, int)
        assert isinstance(rshape, tuple)
        assert isinstance(ashape, tuple)
        # The real and abstract shapes describe the same array, so they
        # must agree dimension-by-dimension.
        assert len(rshape) == len(ashape)
        for rdim, adim in zip(rshape, ashape):
            assert isinstance(rdim, int)
            assert isinstance(adim, (int, _AbstractDim))
        self.memory = memory_space
        self.serial_number = serial_number
        self.dtype = dtype
        self.rshape = rshape
        self.ashape = ashape
        self.input_index = input_index
        self.output_index = output_index

    @property
    def ndim(self):
        # Number of dimensions, taken from the abstract shape.
        return len(self.ashape)

    @property
    def is_base(self):
        # ``True`` iff this variable is the base owner of its memory
        # space rather than a view of it.
        return self.serial_number == self.memory.base_serial_number

    @property
    def is_input(self):
        return self.input_index is not None

    @property
    def is_output(self):
        return self.output_index is not None

    @property
    def var_name(self):
        # The name of variable stored in global memory space.
        # Overridden by _TraceScalar / _TraceArray.
        raise NotImplementedError

    @property
    def lvar_name(self):
        # The name of variable stored in registers in each thread.
        # Overridden by _TraceScalar / _TraceArray.
        raise NotImplementedError

    @property
    def indexer_name(self):
        """The name of CUDA CIndexer variable for the variable.
        """
        # TODO(asi1024): Unify indexer with other variables which have the
        # same shape, for performance improvements.
        return 'ind{}_{}'.format(self.memory.id, self.serial_number)

    def format(self, form, **kwargs):
        """Returns a string following the format taken as an input.

        ``$type``, ``$var``, ``$lvar`` and ``$indexer`` in ``form`` expand
        to this variable's C type name and generated variable names; extra
        keyword arguments are substituted as well, with any dtype value
        first converted to its C type name.
        """
        kwargs = dict([
            (k, get_typename(v) if isinstance(v, numpy.dtype) else v)
            for k, v in kwargs.items()]
        )
        # NOTE(review): relies on a module-level ``string`` import that is
        # outside this chunk -- confirm it exists in the full file.
        return string.Template(form).substitute(
            type=get_typename(self.dtype),
            var=self.var_name,
            lvar=self.lvar_name,
            indexer=self.indexer_name,
            **kwargs
        )

    def __hash__(self):
        # Deliberately unusable: these objects must not be put in
        # set/dict, whose content order is not guaranteed to be stable.
        assert False, (
            '__hash__ is not defined. Use _VariableSet instead of '
            'set/dict because they do not guarantee the order of contents.')
class _TraceScalar(_TraceVariable):
    """An abstracted scalar object.

    Attributes:
        const_value (scalar object or None): A compile-time constant value.
            Actually, it is ``None`` iff ``self.is_input`` is ``True``.
    """

    # TODO(asi1024): Remove index argument.
    def __init__(
            self, index, serial_number, dtype, input_index=None, *,
            const_value=None,):
        # Scalars have no shape (rshape and ashape are both ``()``) and
        # are never used as an output parameter.
        super().__init__(
            index, serial_number, dtype, (), (), input_index, None)
        self.const_value = const_value

    @property
    def var_name(self):
        if self.const_value is None:
            # Runtime input scalar: named after its memory space id.
            return 'a{}'.format(self.memory.id)
        # Compile-time constants are emitted as C literals.
        if self.dtype == '?':
            # C++ boolean literals are lowercase (true/false).
            return str(self.const_value).lower()
        if self.dtype.kind == 'c':
            # Complex constants need explicit construction from the real
            # and imaginary parts.
            return '{}({}, {})'.format(
                get_typename(self.dtype),
                self.const_value.real,
                self.const_value.imag)
        return str(self.const_value)

    @property
    def lvar_name(self):
        return 'v{}'.format(self.memory.id)

    def as_interface(self):
        # Wraps this trace object for use inside the traced user function.
        return _fusion_interface._ScalarProxy(self)

    def key(self):
        # Two scalar variables are identified iff they share a memory space.
        return (self.memory.id,)
class _TraceArray(_TraceVariable):
    """An abstracted array object.

    At most one of the three view relations below is set, via the
    corresponding keyword argument to ``__init__``:

    Attributes:
        broadcasted_from (_TraceArray, optional): Array this one was
            broadcast from.
        rotated_from (_TraceArray, optional): Array this one was rotated
            from.
        axis (int, optional): The axis to rotate.
        indexed_from (_TraceArray, optional): Array this one was sliced
            from.
        index_key (slice): The key used for the slicing.
    """

    def __init__(
            self, index, serial_number, dtype, input_index=None,
            output_index=None, *, rshape, ashape, **kwargs):
        if ashape is None:
            # Input arrays get a fully symbolic shape: one _AbstractDim
            # per axis, tied to the input's position.
            assert input_index is not None
            ndim = len(rshape)
            ashape = tuple([
                _AbstractDim(input_index, axis) for axis in range(ndim)])
        super().__init__(
            index, serial_number, dtype, rshape, ashape,
            input_index, output_index)
        # View bookkeeping: which array (if any) this one is a view of,
        # and the kind of view (broadcast / rotate / slice).
        self._view_of = None
        self.is_broadcast = False
        self.rotate_axis = None
        self.slice_key = None
        if 'broadcasted_from' in kwargs:
            self._view_of = kwargs.pop('broadcasted_from')
            self.is_broadcast = True
        elif 'rotated_from' in kwargs:
            self._view_of = kwargs.pop('rotated_from')
            self.rotate_axis = kwargs.pop('axis')
        elif 'indexed_from' in kwargs:
            self._view_of = kwargs.pop('indexed_from')
            self.slice_key = kwargs.pop('index_key')
        # No other keyword arguments are accepted.
        assert len(kwargs) == 0, kwargs

    @property
    def var_name(self):
        return 'a{}_{}'.format(self.memory.id, self.serial_number)

    @property
    def lvar_name(self):
        return 'v{}_{}'.format(self.memory.id, self.serial_number)

    def as_interface(self):
        # Wraps this trace object for use inside the traced user function.
        return _fusion_interface._ArrayProxy(self)

    def make_view(self, serial_number, **kwargs):
        # Creates a new trace array sharing this array's memory space.
        # ``rshape``/``ashape`` default to this array's shapes; remaining
        # kwargs (e.g. ``broadcasted_from``) describe the view relation.
        rshape = kwargs.pop('rshape', self.rshape)
        ashape = kwargs.pop('ashape', self.ashape)
        return _TraceArray(
            self.memory, serial_number, self.dtype,
            rshape=rshape, ashape=ashape, **kwargs)

    def key(self):
        """Two variables can be identified if they have the same key.
        """
        if isinstance(self.slice_key, tuple):
            # Normalize slice objects into hashable (start, stop, step)
            # triples; only trivial slices are supported.
            slice_key = []
            for s in self.slice_key:
                if isinstance(s, slice):
                    if not (s.start is None
                            and s.stop is None
                            and s.step in (None, 1, -1)):
                        raise NotImplementedError(
                            'Basic slice supports only x[::] and x[::-1].')
                    slice_key.append((s.start, s.stop, s.step))
                else:
                    slice_key.append(s)
            slice_key = tuple(slice_key)
        else:
            slice_key = self.slice_key
        return (
            self.memory.id, self.ashape, self.input_index,
            getattr(self._view_of, 'serial_number', None),
            self.is_broadcast, self.rotate_axis, slice_key,
        )
class _VariableSet:
    """An order-preserving, duplicate-free collection of trace variables.

    Used instead of ``set``/``dict`` so the iteration order of the
    contents stays stable.
    """

    def __init__(self, *args):
        self.contents = []
        for var in args:
            assert isinstance(var, _TraceVariable)
            self.add(var)

    def __len__(self):
        return len(self.contents)

    def item(self):
        # Unwrap the single element; exactly one must be present.
        assert len(self.contents) == 1
        return self.contents[0]

    def add(self, x):
        # Append only if not already present (linear scan keeps order).
        if x not in self.contents:
            self.contents.append(x)

    def __iadd__(self, other):
        assert isinstance(other, _VariableSet)
        for var in other.contents:
            self.add(var)
        return self

    def __add__(self, other):
        merged = _VariableSet(*self.contents)
        merged += other
        return merged

    def __contains__(self, elem):
        return elem in self.contents

    def __iter__(self):
        return iter(self.contents)

    def __isub__(self, other):
        assert isinstance(other, _VariableSet)
        for var in other.contents:
            if var in self.contents:
                self.contents.remove(var)
        return self

    def __sub__(self, other):
        remaining = _VariableSet(*self.contents)
        remaining -= other
        return remaining
This diff is collapsed.
from libcpp cimport vector
from cupy._core cimport _carray
from cupy._core cimport _scalar
from cupy._core._carray cimport shape_t
from cupy._core.core cimport _ndarray_base
from cupy.cuda cimport memory
from cupy.cuda cimport texture
cdef class ParameterInfo:
    # Parsed description of a single kernel parameter declaration
    # (the parsing logic lives in the corresponding .pyx file).
    cdef:
        readonly str name
        readonly object dtype
        readonly str ctype
        readonly bint raw       # presumably marks 'raw' params -- confirm in .pyx
        readonly bint is_const
cdef enum _ArgKind:
    # Discriminates the kinds of arguments a kernel can receive.
    ARG_KIND_NDARRAY = 1
    ARG_KIND_INDEXER
    ARG_KIND_SCALAR
    ARG_KIND_POINTER
    ARG_KIND_TEXTURE
cdef class _ArgInfo:
    # Holds metadata of an argument.
    # This class is immutable and used as a part of hash keys.
    cdef:
        readonly _ArgKind arg_kind
        readonly type type
        readonly object dtype
        readonly int ndim
        readonly bint c_contiguous
        readonly bint index_32_bits

    cdef _ArgInfo _init(
        self,
        _ArgKind arg_kind,
        type typ,
        object dtype,
        int ndim,
        bint c_contiguous,
        bint index_32_bits)

    # Factory methods: one per supported argument kind, plus a generic
    # dispatcher (from_arg).
    @staticmethod
    cdef _ArgInfo from_arg(object arg)
    @staticmethod
    cdef _ArgInfo from_ndarray(_ndarray_base arg)
    @staticmethod
    cdef _ArgInfo from_scalar(_scalar.CScalar arg)
    @staticmethod
    cdef _ArgInfo from_indexer(_carray.Indexer arg)
    @staticmethod
    cdef _ArgInfo from_memptr(memory.MemoryPointer arg)
    @staticmethod
    cdef _ArgInfo from_texture(texture.TextureObject arg)

    cdef _ArgInfo as_ndarray_with_ndim(self, int ndim)
    cdef bint is_ndarray(self)
    cdef bint is_scalar(self)

    # Helpers producing C-source fragments for kernel code generation.
    cdef str get_c_type(self)
    cdef str get_param_c_type(self, ParameterInfo p)
    cdef str get_c_var_name(self, ParameterInfo p)
cdef class _TypeMap:
    # Typedef mapping between C types.
    # This class is immutable.
    cdef:
        tuple _pairs  # pairs consumed by get_typedef_code -- layout in .pyx

    # Renders the mapping as C 'typedef' source lines.
    cdef str get_typedef_code(self)
cdef class _Op:
    """Simple data structure that represents a kernel routine with single \
concrete dtype mapping.
    """
    cdef:
        readonly tuple in_types
        readonly tuple out_types
        readonly int nin
        readonly int nout
        readonly object routine
        # If the type combination specified by in_types and out_types is
        # disallowed, error_func must be set instead of routine.
        # It's called by check_valid() method.
        readonly object error_func

    @staticmethod
    cdef _Op _from_type_and_routine_or_error_func(
        str typ, object routine, object error_func)

    # Creates an op instance parsing a dtype mapping.
    @staticmethod
    cdef _Op from_type_and_routine(str typ, routine)

    cpdef tuple get_in_dtypes(self)
    cpdef tuple get_out_dtypes(self)

    # Creates an op instance parsing a dtype mapping with given error function.
    @staticmethod
    cdef _Op from_type_and_error_func(str typ, error_func)

    # Raises an error if error_func is given.
    cdef check_valid(self)
cdef class _Ops:
    """A kernel routine representation with various dtype mappings.
    """
    cdef:
        readonly tuple ops  # the candidate _Op instances
        readonly int nin
        readonly int nout

    @staticmethod
    cdef _Ops from_tuples(object ops, routine)

    # Queries a single op from input arguments.
    cpdef _Op guess_routine(
        self, str name, dict cache, list in_args, dtype, _Ops out_ops)
    cpdef _Op _guess_routine_from_in_types(
        self, tuple in_types, object can_cast=*)
    cpdef _Op _guess_routine_from_dtype(self, object dtype)
# Public ufunc factory ('*' marks optional arguments in .pxd declarations).
cpdef create_ufunc(name, ops, routine=*, preamble=*, doc=*,
                   default_casting=*, loop_prep=*, out_ops=*,
                   cutensor_op=*, scatter_op=*)

# Helpers shared by the elementwise-kernel machinery (defined in the .pyx).
cdef tuple _get_arginfos(list args)
cdef str _get_kernel_params(tuple params, tuple arginfos)
cdef list _broadcast(list args, tuple params, bint use_size, shape_t& shape)
cdef list _get_out_args_from_optionals(
    subtype, list out_args, tuple out_types, const shape_t& out_shape, casting,
    obj)
cdef list _get_out_args_with_params(
    list out_args, tuple out_types,
    const shape_t& out_shape, tuple out_params, bint is_size_specified)
cdef _check_peer_access(_ndarray_base arr, int device_id)
cdef list _preprocess_args(int dev_id, args, bint use_c_scalar)
cdef shape_t _reduce_dims(list args, tuple params, const shape_t& shape)
This diff is collapsed.
from cupy._core.core cimport _ndarray_base
from libcpp.pair cimport pair
cpdef pair[Py_ssize_t, Py_ssize_t] get_bound(_ndarray_base array)
cpdef bint may_share_bounds(_ndarray_base a, _ndarray_base b)
from cupy._core.core cimport _ndarray_base
from cupy.cuda cimport memory
from libcpp.pair cimport pair
cpdef pair[Py_ssize_t, Py_ssize_t] get_bound(_ndarray_base array):
    # Computes the half-open byte range [left, right) of the memory that
    # ``array`` touches, accounting for negative strides.
    cdef Py_ssize_t left = array.data.ptr
    cdef Py_ssize_t right = left
    cdef Py_ssize_t tmp
    cdef pair[Py_ssize_t, Py_ssize_t] ret
    cdef size_t i
    for i in range(array._shape.size()):
        # shape[i] != 0 is assumed
        # Extent contributed by axis i: a positive stride extends the
        # range to the right, a negative stride to the left.
        tmp = (array._shape[i] - 1) * array._strides[i]
        if tmp > 0:
            right += tmp
        else:
            left += tmp
    ret.first = left
    # The rightmost element still occupies ``itemsize`` bytes.
    ret.second = right + <Py_ssize_t>array.dtype.itemsize
    return ret
cpdef bint may_share_bounds(_ndarray_base a, _ndarray_base b):
    # Conservative overlap test: True when the byte ranges of the two
    # arrays intersect within the same allocation on the same device.
    cdef memory.MemoryPointer a_data = a.data
    cdef memory.MemoryPointer b_data = b.data
    cdef pair[Py_ssize_t, Py_ssize_t] a_range, b_range
    # Different devices, different allocations, or empty arrays can
    # never overlap.
    if (a_data.device_id != b_data.device_id
            or a_data.mem.ptr != b_data.mem.ptr
            or a.size == 0 or b.size == 0):
        return False
    a_range = get_bound(a)
    b_range = get_bound(b)
    # Half-open interval intersection test.
    return a_range.first < b_range.second and b_range.first < a_range.second
# Thread-local storage for the active optimization context.
cdef object _thread_local
# Cache of contexts keyed by their ``key`` string.
cdef dict _contexts


cdef class _OptimizationConfig:
    # Tuning knobs for one kernel-parameter optimization run.
    cdef readonly object optimize_impl
    cdef readonly int max_trials
    cdef readonly float timeout
    cdef readonly float expected_total_time_per_trial
    cdef readonly float max_total_time_per_trial


cdef class _OptimizationContext:
    # Stores the optimized parameters found for a given key.
    cdef readonly str key
    cdef readonly _OptimizationConfig config
    cdef readonly dict _params_map
    cdef readonly bint _dirty


cpdef _OptimizationContext get_current_context()
import pickle
import threading
# Per-thread slot holding the currently active optimization context.
cdef _thread_local = threading.local()
# Process-wide cache of _OptimizationContext objects keyed by ``key``.
cdef _contexts = {}
cdef class _OptimizationConfig:
    """Holds the knobs controlling one optimization run (trial count and
    time limits)."""

    def __init__(
            self, optimize_impl, *,
            int max_trials=100,
            float timeout=1,
            float expected_total_time_per_trial=100 * 1e-6,
            float max_total_time_per_trial=0.1):
        self.optimize_impl = optimize_impl
        self.max_trials = max_trials
        self.timeout = timeout
        self.expected_total_time_per_trial = expected_total_time_per_trial
        self.max_total_time_per_trial = max_total_time_per_trial
cdef class _OptimizationContext:
    """Per-key store of optimized parameters.

    ``_dirty`` tracks whether ``_params_map`` has changes that have not
    been saved to disk yet.
    """

    def __init__(self, str key, _OptimizationConfig config):
        self.key = key
        self.config = config
        self._params_map = {}
        self._dirty = False

    def get_params(self, key):
        # Returns the cached params for ``key``, or None if not set yet.
        return self._params_map.get(key)

    def set_params(self, key, params):
        self._params_map[key] = params
        self._dirty = True

    def save(self, filepath):
        # Persists (key, params_map) via pickle and marks the context clean.
        with open(filepath, mode='wb') as f:
            pickle.dump((self.key, self._params_map), f)
        self._dirty = False

    def load(self, filepath):
        # NOTE(review): pickle.load on an untrusted file can execute
        # arbitrary code -- only load trusted files.
        with open(filepath, mode='rb') as f:
            key, params_map = pickle.load(f)
        # Refuse to load parameters recorded for a different context key.
        if key != self.key:
            raise ValueError(
                'Optimization key mismatch {} != {}'.format(key, self.key))
        self._params_map = params_map
        self._dirty = False

    def _is_dirty(self):
        return self._dirty
cpdef _OptimizationContext get_current_context():
    # Returns the context bound to the calling thread, or None.
    try:
        return _thread_local.current_context
    except AttributeError:
        # set_current_context() has not run in this thread yet.
        return None
def set_current_context(_OptimizationContext context):
    # Binds ``context`` as the active context for the calling thread.
    _thread_local.current_context = context
def get_new_context(
        str key, object optimize_impl, dict config_dict):
    # Returns the cached context for ``key``; creates and caches a new one
    # with the given config only when no context exists for ``key`` yet
    # (in which case ``optimize_impl``/``config_dict`` are ignored).
    c = _contexts.get(key)
    if c is None:
        config = _OptimizationConfig(optimize_impl, **config_dict)
        c = _OptimizationContext(key, config)
        _contexts[key] = c
    return c
def _clear_all_contexts_cache():
    # Drops every cached context; only valid while no context is active.
    global _contexts
    assert get_current_context() is None
    _contexts = {}
from cupy._core._carray cimport shape_t
from cupy._core cimport _kernel
from cupy._core.core cimport _ndarray_base
from cupy.cuda cimport function
# Block size used by the reduction kernels (set in the .pyx).
cdef Py_ssize_t _block_size

# Axis/shape normalization helpers. _get_axis presumably returns
# (reduce_axis, out_axis) tuples -- see _get_out_shape's parameters.
cpdef tuple _get_axis(object axis, Py_ssize_t ndim)
cpdef shape_t _get_out_shape(
    const shape_t& shape, tuple reduce_axis, tuple out_axis, bint keepdims)
cdef class _AbstractReductionKernel:
    # Common base for reduction kernels (declarations only; the
    # implementations live in the corresponding .pyx file).
    cdef:
        readonly str name
        public str identity
        readonly tuple in_params
        readonly tuple out_params
        readonly tuple _params
        readonly str __name__
        readonly dict _cached_codes

    # Entry point dispatching one reduction call.
    cpdef _ndarray_base _call(
        self,
        list in_args, list out_args,
        const shape_t& a_shape, axis, dtype,
        bint keepdims, bint reduce_dims, int device_id,
        stream, bint try_use_cub=*, bint sort_reduce_axis=*)

    cdef void _launch(
        self, out_block_num, block_size, block_stride,
        in_args, out_args, in_shape, out_shape, types,
        map_expr, reduce_expr, post_map_expr, reduce_type,
        stream, params)

    cdef tuple _get_expressions_and_types(
        self, list in_args, list out_args, dtype)

    cdef list _get_out_args(
        self, list out_args, tuple out_types, const shape_t& out_shape)

    cdef function.Function _get_function(
        self,
        tuple params, tuple arginfos, _kernel._TypeMap types,
        str map_expr, str reduce_expr, str post_map_expr, str reduce_type,
        Py_ssize_t block_size)
cdef class ReductionKernel(_AbstractReductionKernel):
    # User-defined reduction kernel; adds the parsed kernel-definition
    # attributes on top of the abstract base.
    cdef:
        readonly int nin
        readonly int nout
        readonly int nargs
        readonly tuple params
        readonly str reduce_expr
        readonly str map_expr
        readonly str post_map_expr
        readonly object options
        readonly bint reduce_dims
        readonly object reduce_type
        readonly str preamble
# Internal helpers for reduction kernels (defined in the .pyx).
cdef shape_t _set_permuted_args(
    list args, tuple axis_permutes, const shape_t& shape, tuple params)
cdef tuple _get_shape_and_strides(list in_args, list out_args)
cdef _optimizer_copy_arg(a)

# Factory for the built-in reduction functions.
cpdef create_reduction_func(
    name, ops, routine=*, identity=*, preamble=*, sort_reduce_axis=*)
This diff is collapsed.
# Module-level ufunc objects; re-exported to Python at the bottom of the
# file (cythonized data cannot be exposed to Python directly).
cdef object _bitwise_and
cdef object _bitwise_or
cdef object _bitwise_xor
cdef object _invert
cdef object _left_shift
cdef object _right_shift
from cupy._core._kernel import create_ufunc
cdef _create_bit_op(name, op, no_bool, doc='', scatter_op=None):
    # Builds a binary bitwise ufunc named 'cupy_<name>' that applies the
    # C operator ``op`` elementwise over the integer dtypes. When
    # ``no_bool`` is True the boolean overload ('??->?') is omitted.
    types = () if no_bool else ('??->?',)
    return create_ufunc(
        'cupy_' + name,
        types + ('bb->b', 'BB->B', 'hh->h', 'HH->H', 'ii->i', 'II->I', 'll->l',
                 'LL->L', 'qq->q', 'QQ->Q'),
        'out0 = in0 %s in1' % op,
        doc=doc, scatter_op=scatter_op)
# Concrete ufunc instances. The triple-quoted texts below are runtime
# data (each becomes the ufunc's __doc__), so they are kept verbatim.
cdef _bitwise_and = _create_bit_op(
    'bitwise_and', '&', False,
    '''Computes the bitwise AND of two arrays elementwise.
Only integer and boolean arrays are handled.
.. seealso:: :data:`numpy.bitwise_and`
''',
    scatter_op='and')

cdef _bitwise_or = _create_bit_op(
    'bitwise_or', '|', False,
    '''Computes the bitwise OR of two arrays elementwise.
Only integer and boolean arrays are handled.
.. seealso:: :data:`numpy.bitwise_or`
''',
    scatter_op='or')

cdef _bitwise_xor = _create_bit_op(
    'bitwise_xor', '^', False,
    '''Computes the bitwise XOR of two arrays elementwise.
Only integer and boolean arrays are handled.
.. seealso:: :data:`numpy.bitwise_xor`
''',
    scatter_op='xor')

# invert is built directly: the boolean overload uses logical NOT (!)
# while the integer overloads use bitwise NOT (~).
cdef _invert = create_ufunc(
    'cupy_invert',
    (('?->?', 'out0 = !in0'), 'b->b', 'B->B', 'h->h', 'H->H', 'i->i', 'I->I',
     'l->l', 'L->L', 'q->q', 'Q->Q'),
    'out0 = ~in0',
    doc='''Computes the bitwise NOT of an array elementwise.
Only integer and boolean arrays are handled.
.. note::
:func:`cupy.bitwise_not` is an alias for :func:`cupy.invert`.
.. seealso:: :data:`numpy.invert`
''')

# The shift ufuncs pass no_bool=True: no boolean overload is generated.
cdef _left_shift = _create_bit_op(
    'left_shift', '<<', True,
    '''Shifts the bits of each integer element to the left.
Only integer arrays are handled.
.. seealso:: :data:`numpy.left_shift`
''')

cdef _right_shift = _create_bit_op(
    'right_shift', '>>', True,
    '''Shifts the bits of each integer element to the right.
Only integer arrays are handled
.. seealso:: :data:`numpy.right_shift`
''')

# Variables to expose to Python
# (cythonized data cannot be exposed to Python, even with cpdef.)
bitwise_and = _bitwise_and
bitwise_or = _bitwise_or
bitwise_xor = _bitwise_xor
invert = _invert
left_shift = _left_shift
right_shift = _right_shift
from cupy._core.core cimport _ndarray_base
cpdef _ndarray_base _ndarray_argwhere(_ndarray_base self)

# Indexing entry points backing ndarray __getitem__/__setitem__.
cdef _ndarray_base _ndarray_getitem(_ndarray_base self, slices)
cdef _ndarray_setitem(_ndarray_base self, slices, value)

cdef tuple _ndarray_nonzero(_ndarray_base self)
cdef _scatter_op(_ndarray_base a, slices, value, op)

# take/put-family routines.
cdef _ndarray_base _ndarray_take(_ndarray_base self, indices, axis, out)
cdef _ndarray_base _ndarray_put(_ndarray_base self, indices, values, mode)
cdef _ndarray_base _ndarray_choose(_ndarray_base self, choices, out, mode)
cdef _ndarray_base _ndarray_compress(_ndarray_base self, condition, axis, out)
cdef _ndarray_base _ndarray_diagonal(_ndarray_base self, offset, axis1, axis2)
cdef _ndarray_base _add_reduceat(
    _ndarray_base array, indices, axis, dtype, out)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment