Commit deb763b7 authored by root's avatar root
Browse files

clone code from github

parent 93bf084b
Pipeline #3386 canceled with stages
# distutils: language = c++
import warnings
import string
import numpy
import cupy
import cupy._core.core as core
from cupy._core._kernel import ElementwiseKernel, _get_warpsize
from cupy._core._ufuncs import elementwise_copy
from libcpp cimport vector
from cupy._core._carray cimport shape_t
from cupy._core._carray cimport strides_t
from cupy._core cimport core
from cupy._core cimport _routines_math as _math
from cupy._core cimport _routines_manipulation as _manipulation
from cupy._core.core cimport _ndarray_base
from cupy._core cimport internal
# _ndarray_base members
cdef _ndarray_base _ndarray_getitem(_ndarray_base self, slices):
    # Implements ``__getitem__``: basic indices (int/slice/None/Ellipsis)
    # are resolved into a view first; any remaining advanced (array/mask)
    # indices are then dispatched to the appropriate helper.
    cdef Py_ssize_t axis
    cdef list slice_list
    cdef _ndarray_base a
    slice_list = _prepare_slice_list(slices)
    a, adv = _view_getitem(self, slice_list)
    if adv is None:
        # Basic indexing only: the view itself is the result.
        return a
    axis = adv  # first axis consumed by the remaining array indices
    if len(slice_list) == 1:
        s = slice_list[0]
        if s.dtype.kind == 'b':
            # Single boolean mask index.
            return _getitem_mask_single(a, s, axis)
        else:
            # Single integer array index: equivalent to take() along axis.
            return a.take(s, axis)
    # Multiple array indices.
    return _getitem_multiple(a, axis, slice_list)
cdef _ndarray_setitem(_ndarray_base self, slices, value):
    # Implements ``__setitem__`` as a scatter 'update' operation.
    # ndarray values have their leading length-1 dimensions dropped first
    # so broadcasting against the indexed region behaves as expected.
    src = value
    if isinstance(src, _ndarray_base):
        src = _squeeze_leading_unit_dims(src)
    _scatter_op(self, slices, src, 'update')
cdef tuple _ndarray_nonzero(_ndarray_base self):
    # Implements ``nonzero``: returns one index array per dimension.
    cdef int ndim
    # dst has shape (count_nonzero, ndim); see _ndarray_argwhere.
    cdef _ndarray_base dst = _ndarray_argwhere(self)
    ndim = self.ndim
    if ndim >= 1:
        # Split the argwhere result column-wise into per-axis index arrays.
        return tuple([dst[:, i] for i in range(ndim)])
    else:
        warnings.warn(
            'calling nonzero on 0d arrays is deprecated',
            DeprecationWarning)
        # 0-d case: NumPy-compatible 1-tuple of zeros, one per nonzero.
        return cupy.zeros(dst.shape[0], numpy.int64),
# TODO(kataoka): Rename the function because `_ndarray_base` does not have
# `argwhere` method
cpdef _ndarray_base _ndarray_argwhere(_ndarray_base self):
    # Return a (count_nonzero, ndim) int64 array of the indices of nonzero
    # elements, like numpy.argwhere.  NOTE: synchronizes with the device to
    # read back the nonzero count.
    cdef Py_ssize_t count_nonzero
    cdef int ndim
    cdef _ndarray_base nonzero
    numpy_int64 = numpy.int64
    if self.size == 0:
        count_nonzero = 0
    else:
        if self.dtype == numpy.bool_:
            # Already a boolean mask; skip the comparison kernel.
            nonzero = self.ravel()
        else:
            nonzero = cupy._core.not_equal(self, 0)
            nonzero = nonzero.ravel()
        # Get number of True in the mask to determine the shape of the array
        # after masking.
        if nonzero.size <= 2 ** 31 - 1:
            # int32 scan suffices while indices fit into 32 bits.
            scan_dtype = numpy.int32
        else:
            scan_dtype = numpy_int64
        chunk_size = 512
        # TODO(anaruse): Use Optuna to automatically tune the threshold
        # that determines whether "incomplete scan" is enabled or not.
        # Basically, "incomplete scan" is fast when the array size is large,
        # but for small arrays, it is better to use the normal method.
        incomplete_scan = nonzero.size > chunk_size
        scan_index = _math.scan(
            nonzero, op=_math.scan_op.SCAN_SUM, dtype=scan_dtype, out=None,
            incomplete=incomplete_scan, chunk_size=chunk_size)
        count_nonzero = int(scan_index[-1])  # synchronize!
    ndim = self._shape.size()
    dst = core.ndarray((count_nonzero, ndim), dtype=numpy_int64)
    if dst.size == 0:
        # No nonzero elements (or 0-d input): nothing to fill in.
        return dst
    nonzero.shape = self.shape
    if incomplete_scan:
        # The incomplete (chunked) scan needs a kernel that finishes the
        # prefix sums inside each block before writing indices.
        warp_size = _get_warpsize()
        size = scan_index.size * chunk_size
        _nonzero_kernel_incomplete_scan(chunk_size, warp_size)(
            nonzero, scan_index, dst,
            size=size, block_size=chunk_size)
    else:
        scan_index.shape = self.shape
        _nonzero_kernel(nonzero, scan_index, dst)
    return dst
cdef _ndarray_base _ndarray_take(_ndarray_base self, indices, axis, out):
    # Implements ``take``: flatten the whole array when axis is None,
    # otherwise take along the single normalized axis.
    cdef Py_ssize_t nd = self._shape.size()
    if axis is None:
        # Treat the array as flattened (all axes from 0 to nd).
        return _take(self, indices, 0, nd, out)
    if nd == 0:
        # check axis after atleast_1d
        internal._normalize_axis_index(axis, 1)
        return _take(self, indices, 0, 0, out)
    axis = internal._normalize_axis_index(axis, nd)
    return _take(self, indices, axis, axis + 1, out)
cdef _ndarray_base _ndarray_put(_ndarray_base self, indices, values, mode):
    # Implements ``put``: write ``values`` at flattened ``indices`` of self,
    # with out-of-bounds behavior selected by ``mode``.
    if mode not in ('raise', 'wrap', 'clip'):
        raise ValueError('clipmode not understood')
    n = self.size
    if not isinstance(indices, _ndarray_base):
        indices = core.array(indices)
    indices = indices.ravel()
    if not isinstance(values, _ndarray_base):
        values = core.array(values, dtype=self.dtype)
    if values.size == 0:
        # values are cycled (i % n_vals in the kernels); empty values would
        # divide by zero, and is a no-op anyway.
        return
    if mode == 'raise':
        # Device-side error flag; checked on the host afterwards.
        err = cupy.zeros((), dtype=numpy.bool_)
        _put_raise_kernel(indices, values, values.size, n, self, err)
        if err:  # synchronizes to inspect the flag
            raise IndexError('invalid entry in indices array')
    elif mode == 'wrap':
        _put_wrap_kernel(indices, values, values.size, n, self)
    elif mode == 'clip':
        _put_clip_kernel(indices, values, values.size, n, self)
cdef _ndarray_base _ndarray_choose(_ndarray_base self, choices, out, mode):
    # Implements ``choose``: use self as an index array that selects, per
    # element, from the first axis of ``choices``.
    a = self
    n = choices.shape[0]
    # broadcast `a` and `choices[i]` for all i
    if a.ndim < choices.ndim - 1:
        for i in range(choices.ndim - 1 - a.ndim):
            a = a[None, ...]
    elif a.ndim > choices.ndim - 1:
        for i in range(a.ndim + 1 - choices.ndim):
            choices = choices[:, None, ...]
    ba, bcs = _manipulation.broadcast(a, choices).values
    if out is None:
        out = core.ndarray(ba.shape[1:], choices.dtype)
    # elements per choice "channel" in the broadcast layout
    n_channel = numpy.prod(bcs[0].shape)
    if mode == 'raise':
        # validate against the pre-broadcast index array (same elements)
        if not ((a < n).all() and (0 <= a).all()):
            raise ValueError('invalid entry in choice array')
        _choose_kernel(ba[0], bcs, n_channel, out)
    elif mode == 'wrap':
        ba = ba[0] % n
        _choose_kernel(ba, bcs, n_channel, out)
    elif mode == 'clip':
        _choose_clip_kernel(ba[0], bcs, n_channel, n, out)
    else:
        raise ValueError('clipmode not understood')
    return out
cdef _ndarray_base _ndarray_compress(_ndarray_base self, condition, axis, out):
    # Implements ``compress``: keep the slices along ``axis`` where the
    # 1-d ``condition`` is true, by converting the condition to indices.
    arr = self
    if numpy.isscalar(condition):
        raise ValueError('condition must be a 1-d array')
    if not isinstance(condition, _ndarray_base):
        condition = core.array(condition, dtype=int)
    if condition.ndim != 1:
        raise ValueError('condition must be a 1-d array')
    # do not test condition.shape
    true_idx = _ndarray_nonzero(condition)  # synchronize
    # the `take` method/function also make the input atleast_1d
    return _ndarray_take(arr, true_idx[0], axis, out)
cdef _ndarray_base _ndarray_diagonal(_ndarray_base self, offset, axis1, axis2):
    # Thin wrapper: implements ``diagonal`` via the module-level helper.
    return _diagonal(self, offset, axis1, axis2)
# private/internal
cdef _ndarray_base _squeeze_leading_unit_dims(_ndarray_base src):
    # remove leading 1s from the shape greedily.
    # TODO(kataoka): compute requested ndim and do not remove too much for
    # printing correct shape in error message.
    cdef Py_ssize_t i
    for i in range(src.ndim):
        if src._shape[i] != 1:
            break
    else:
        # Loop ran to completion: every dimension is 1 (or ndim == 0).
        i = src.ndim
    if i == 0:
        # Nothing to strip; return the original (no view created).
        return src
    src = src.view()
    # Erase the leading extents directly from the C++ shape/stride vectors;
    # equivalent to the Python-level slicing shown below.
    # del src._shape[:i]
    # del src._strides[:i]
    src._shape.erase(src._shape.begin(), src._shape.begin()+i)
    src._strides.erase(src._strides.begin(), src._strides.begin()+i)
    return src
cpdef list _prepare_slice_list(slices):
    # Normalize the argument of __getitem__/__setitem__ into a flat list in
    # which every array-like index has been converted to a cupy.ndarray.
    cdef Py_ssize_t i
    cdef list slice_list
    cdef bint fix_empty_dtype
    if isinstance(slices, tuple):
        slice_list = list(slices)
    else:
        slice_list = [slices]
    # Convert list/NumPy/CUDA-Array-Interface arrays to cupy.ndarray.
    # - Scalar int in indices returns a view.
    # - Other array-like (including ()-shaped array) in indices forces to
    #   return a new array.
    for i, s in enumerate(slice_list):
        if s is None or s is Ellipsis or isinstance(s, (slice, _ndarray_base)):
            # Already in canonical form; leave untouched.
            continue
        fix_empty_dtype = False
        if isinstance(s, (list, tuple)):
            # This condition looks inaccurate, but so is NumPy.
            # a[1, [np.empty(0, float)]] is allowed, while
            # a[1, np.empty((1, 0), float)] raises IndexError.
            fix_empty_dtype = True
        elif numpy.isscalar(s):
            if not isinstance(s, (bool, numpy.bool_)):
                # keep scalar int
                continue
        if cupy.min_scalar_type(s).char == 'O':
            # Object dtype means non-numeric entries.
            raise IndexError(
                'arrays used as indices must be of integer (or boolean) type')
        try:
            s = core.array(s, dtype=None, copy=False)
        except ValueError:
            # "Unsupported dtype"
            raise IndexError(
                'only integers, slices (`:`), ellipsis (`...`),'
                'numpy.newaxis (`None`) and integer or '
                'boolean arrays are valid indices')
        if fix_empty_dtype and s.size == 0:
            # An empty list means empty indices, not empty mask.
            # Fix default dtype (float64).
            s = s.astype(numpy.int32)
        slice_list[i] = s
    return slice_list
cdef tuple _view_getitem(_ndarray_base a, list slice_list):
    # Process scalar/slice/ellipsis indices
    # Returns a 2-tuple
    # - [0] (ndarray): view of a
    # - [1] (int or None): start axis for remaining indices
    # slice_list will be overwritten.
    # input should contain:
    #     None, Ellipsis, slice (start:stop:step), scalar int, or
    #     cupy.ndarray
    # output will contain:
    #     cupy.ndarray
    cdef shape_t shape
    cdef strides_t strides
    cdef _ndarray_base v
    cdef Py_ssize_t ndim_a, axis_a, ndim_v, axis_v, ndim_ellipsis
    cdef Py_ssize_t i, k, offset
    cdef Py_ssize_t s_start, s_stop, s_step, dim, ind
    cdef slice ss
    cdef list index_list, axes
    cdef vector.vector[bint] array_like_flags
    cdef vector.vector[Py_ssize_t] array_ndims
    cdef bint has_ellipsis, flag
    cdef char kind

    # First pass: validate indices and count how many axes of `a` each one
    # consumes (axis_a), so the ellipsis expansion width can be computed.
    axis_a = 0
    has_ellipsis = False
    for s in slice_list:
        if s is None:
            continue
        elif s is Ellipsis:
            if has_ellipsis:
                raise IndexError(
                    "an index can only have a single ellipsis ('...')")
            has_ellipsis = True
        elif isinstance(s, _ndarray_base):
            kind = ord(s.dtype.kind)
            if kind == b'b':
                # A boolean mask consumes as many axes as its ndim.
                k = s.ndim
            elif kind == b'i' or kind == b'u':
                # An integer array consumes exactly one axis.
                k = 1
            else:
                raise IndexError(
                    'arrays used as indices must be of integer or boolean '
                    'type. (actual: {})'.format(s.dtype.type))
            array_ndims.push_back(k)
            axis_a += k
        else:
            # isinstance(s, slice) or numpy.isscalar(s)
            axis_a += 1
    if not has_ellipsis:
        # Implicit trailing ellipsis keeps untouched axes.
        slice_list.append(Ellipsis)
    ndim_a = a._shape.size()
    if axis_a > ndim_a:
        raise IndexError(
            'too many indices for array: '
            f'array is {ndim_a}-dimensional, but {axis_a} were indexed')
    ndim_ellipsis = ndim_a - axis_a

    # Create new shape and stride
    i = 0
    axis_a = 0
    axis_v = 0
    offset = 0
    # index_list: remaining indices to be processed.
    # Each elem is a 3-tuple (array, axis_start, axis_count)
    index_list = []
    for s in slice_list:
        if s is None:
            # newaxis: insert a length-1 axis with stride 0.
            shape.push_back(1)
            strides.push_back(0)
            axis_v += 1
            array_like_flags.push_back(False)
        elif isinstance(s, _ndarray_base):
            # Keep the consumed axes in the view; the actual gather is
            # performed later by the caller.
            k = array_ndims[i]
            index_list.append((s, axis_v, k))
            i += 1
            kind = ord(s.dtype.kind)
            if kind == b'b':
                _check_mask_shape(a, s, axis_a)
            for _ in range(k):
                shape.push_back(a._shape[axis_a])
                strides.push_back(a._strides[axis_a])
                axis_a += 1
            axis_v += k
            array_like_flags.push_back(True)
        elif s is Ellipsis:
            # Pass the unindexed axes through unchanged.
            for _ in range(ndim_ellipsis):
                shape.push_back(a._shape[axis_a])
                strides.push_back(a._strides[axis_a])
                axis_a += 1
            axis_v += ndim_ellipsis
            array_like_flags.push_back(False)
        elif isinstance(s, slice):
            ss = internal.complete_slice(s, a._shape[axis_a])
            s_start = ss.start
            s_stop = ss.stop
            s_step = ss.step
            if s_step > 0:
                dim = (s_stop - s_start - 1) // s_step + 1
            else:
                dim = (s_stop - s_start + 1) // s_step + 1
            if dim == 0:
                # Empty axis: keep the original stride (value is arbitrary).
                strides.push_back(a._strides[axis_a])
            else:
                strides.push_back(a._strides[axis_a] * s_step)
            if s_start > 0:
                offset += a._strides[axis_a] * s_start
            shape.push_back(dim)
            axis_a += 1
            axis_v += 1
            array_like_flags.push_back(False)
        else:
            # numpy.isscalar(s)
            ind = int(s)
            if ind < 0:
                ind += a._shape[axis_a]
            if not (0 <= ind < a._shape[axis_a]):
                msg = ('Index %s is out of bounds for axis %s with '
                       'size %s' % (s, axis_a, a._shape[axis_a]))
                raise IndexError(msg)
            # Scalar drops the axis: fold it into the byte offset.
            offset += ind * a._strides[axis_a]
            axis_a += 1
            # array-like but not array
            array_like_flags.push_back(True)
    ndim_v = axis_v
    v = a.view()
    if a.size != 0:
        v.data = a.data + offset
    v._set_shape_and_strides(shape, strides, True, True)
    if array_ndims.empty():
        # no advanced indexing. no mask.
        del slice_list[:]
        return v, None
    # Leave only the array indices for the caller.
    slice_list[:] = [s for s, _, _ in index_list]

    # non-consecutive array-like indices => batch dims go first in output
    # consecutive array-like indices => start batch dims there
    # k tracks a tiny state machine: 0 = before the array-like run,
    # 1 = inside it, 2 = after it; a second run (flag while k == 2) means
    # the array-like indices are non-consecutive.
    k = 0
    for i, flag in enumerate(array_like_flags):
        if k == 0:
            if flag:
                k = 1
        elif k == 1:
            if not flag:
                k = 2
        else:  # k == 2
            if flag:
                break
    else:
        # Consecutive: batch dims start at the first array index's axis.
        return v, index_list[0][1]
    # compute transpose arg
    # Non-consecutive: move all array-consumed axes to the front.
    axes = []
    for _, axis_v, k in index_list:
        for _ in range(k):
            axes.append(axis_v)
            axis_v += 1
    axes.extend([dim for dim in range(ndim_v) if dim not in axes])
    v = _manipulation._transpose(v, axes)
    return v, 0
@cupy._util.memoize(for_each_device=True)
def _nonzero_kernel_incomplete_scan(block_size, warp_size=32):
    """Build the argwhere kernel used with a chunked ("incomplete") scan.

    ``b`` holds per-chunk partial sums; the kernel completes the prefix sum
    within each block using warp shuffles, then writes the multi-dimensional
    index of every nonzero element of ``a`` into row ``x0`` of ``dst``.
    Memoized per device since the CUDA source depends on block/warp size.
    """
    in_params = 'raw T a, raw S b'
    out_params = 'raw O dst'
    loop_prep = string.Template("""
        __shared__ S smem[${warp_size}];
        const int n_warp = ${block_size} / ${warp_size};
        const int warp_id = threadIdx.x / ${warp_size};
        const int lane_id = threadIdx.x % ${warp_size};
    """).substitute(block_size=block_size, warp_size=warp_size)
    loop_body = string.Template("""
        S x = 0;
        if (i < a.size()) x = a[i];
        for (int j = 1; j < ${warp_size}; j *= 2) {
            S tmp = __shfl_up_sync(0xffffffff, x, j, ${warp_size});
            if (lane_id - j >= 0) x += tmp;
        }
        if (lane_id == ${warp_size} - 1) smem[warp_id] = x;
        __syncthreads();
        if (warp_id == 0) {
            S y = 0;
            if (lane_id < n_warp) y = smem[lane_id];
            for (int j = 1; j < n_warp; j *= 2) {
                S tmp = __shfl_up_sync(0xffffffff, y, j, ${warp_size});
                if (lane_id - j >= 0) y += tmp;
            }
            int block_id = i / ${block_size};
            S base = 0;
            if (block_id > 0) base = b[block_id - 1];
            if (lane_id == ${warp_size} - 1) y = 0;
            smem[(lane_id + 1) % ${warp_size}] = y + base;
        }
        __syncthreads();
        x += smem[warp_id];
        S x0 = __shfl_up_sync(0xffffffff, x, 1, ${warp_size});
        if (lane_id == 0) {
            x0 = smem[warp_id];
        }
        if (x0 < x && i < a.size()) {
            O j = i;
            for (int d = a.ndim - 1; d >= 0; d--) {
                ptrdiff_t ind[] = {x0, d};
                O j_next = j / a.shape()[d];
                dst[ind] = j - j_next * a.shape()[d];
                j = j_next;
            }
        }
    """).substitute(block_size=block_size, warp_size=warp_size)
    return cupy.ElementwiseKernel(in_params, out_params, loop_body,
                                  'cupy_nonzero_kernel_incomplete_scan',
                                  loop_prep=loop_prep)
# Writes the multi-dimensional index of each nonzero element into ``dst``;
# ``index`` is the inclusive prefix sum of the nonzero mask, so ``index - 1``
# is the destination row.
_nonzero_kernel = ElementwiseKernel(
    'T src, S index', 'raw U dst',
    '''
    if (src != 0){
        for(int j = 0; j < _ind.ndim; j++){
            ptrdiff_t ind[] = {index - 1, j};
            dst[ind] = _ind.get()[j];
        }
    }''',
    'cupy_nonzero_kernel',
    reduce_dims=False)

# Shared gather logic for take: maps the output element index ``i`` and the
# (wrapped) index value onto the flattened source offset, using the
# left/center/right dimension products computed by ``_take``.
_take_kernel_core = '''
ptrdiff_t out_i = indices % index_range;
if (out_i < 0) out_i += index_range;
if (ldim != 1) out_i += (i / (cdim * rdim)) * index_range;
if (rdim != 1) out_i = out_i * rdim + i % rdim;
out = a[out_i];
'''

# take with an array of indices.
_take_kernel = ElementwiseKernel(
    'raw T a, S indices, uint32 ldim, uint32 cdim, uint32 rdim, '
    'int64 index_range',
    'T out', _take_kernel_core, 'cupy_take')

# take with a single scalar index (avoids materializing an index array).
_take_kernel_scalar = ElementwiseKernel(
    'raw T a, int64 indices, uint32 ldim, uint32 cdim, uint32 rdim, '
    'int64 index_range',
    'T out', _take_kernel_core, 'cupy_take_scalar')
# choose: select choices[a] per element; ``n_channel`` is the number of
# elements in one broadcast choice array.
_choose_kernel = ElementwiseKernel(
    'S a, raw T choices, int32 n_channel',
    'T y',
    'y = choices[i + n_channel * a]',
    'cupy_choose')

# choose with mode='clip': out-of-range indices are clamped to [0, n - 1].
_choose_clip_kernel = ElementwiseKernel(
    'S a, raw T choices, int32 n_channel, int32 n',
    'T y',
    '''
    S x = a;
    if (a < 0) {
        x = 0;
    } else if (a >= n) {
        x = n - 1;
    }
    y = choices[i + n_channel * x];
    ''',
    'cupy_choose_clip')
# put with mode='raise': sets the device-side ``err`` flag for any index
# outside [-n, n); values are cycled via ``i % n_vals``.
cdef _put_raise_kernel = ElementwiseKernel(
    'S ind, raw T vals, int64 n_vals, int64 n',
    'raw U data, raw bool err',
    '''
    ptrdiff_t ind_ = ind;
    if (!(-n <= ind_ && ind_ < n)) {
        err[0] = 1;
    } else {
        if (ind_ < 0) ind_ += n;
        data[ind_] = (U)(vals[i % n_vals]);
    }
    ''',
    'cupy_put_raise')

# put with mode='wrap': indices are taken modulo n.
cdef _put_wrap_kernel = ElementwiseKernel(
    'S ind, raw T vals, int64 n_vals, int64 n',
    'raw U data',
    '''
    ptrdiff_t ind_ = ind;
    ind_ %= n;
    if (ind_ < 0) ind_ += n;
    data[ind_] = (U)(vals[i % n_vals]);
    ''',
    'cupy_put_wrap')

# put with mode='clip': indices are clamped to [0, n - 1].
cdef _put_clip_kernel = ElementwiseKernel(
    'S ind, raw T vals, int64 n_vals, int64 n',
    'raw U data',
    '''
    ptrdiff_t ind_ = ind;
    if (ind_ < 0) {
        ind_ = 0;
    } else if (ind_ >= n) {
        ind_ = n - 1;
    }
    data[ind_] = (U)(vals[i % n_vals]);
    ''',
    'cupy_put_clip')
cdef _create_scatter_kernel(name, code):
    """Build a scatter kernel parameterized by the update expression.

    ``code`` operates on ``out0`` (the target element of ``a``) and ``in1``
    (the incoming value ``v``); ``in0`` aliases ``out0`` for read-modify-write
    ops.  Indices are wrapped into [0, adim) like NumPy's wrap mode.
    """
    return ElementwiseKernel(
        'T v, S indices, int32 cdim, int32 rdim, int32 adim',
        'raw T a',
        string.Template('''
        S wrap_indices = indices % adim;
        if (wrap_indices < 0) wrap_indices += adim;
        ptrdiff_t li = i / (rdim * cdim);
        ptrdiff_t ri = i % rdim;
        T &out0 = a[(li * adim + wrap_indices) * rdim + ri];
        T &in0 = out0;
        const T &in1 = v;
        ${code};
        ''').substitute(code=code),
        name,
    )
# Index-based scatter kernels.  All ops except plain 'update' use atomics
# because multiple indices may target the same element.
cdef _scatter_update_kernel = _create_scatter_kernel(
    'cupy_scatter_update', 'out0 = in1')
cdef _scatter_add_kernel = _create_scatter_kernel(
    'cupy_scatter_add', 'atomicAdd(&out0, in1)')
cdef _scatter_sub_kernel = _create_scatter_kernel(
    'cupy_scatter_sub', 'atomicSub(&out0, in1)')
cdef _scatter_max_kernel = _create_scatter_kernel(
    'cupy_scatter_max', 'atomicMax(&out0, in1)')
cdef _scatter_min_kernel = _create_scatter_kernel(
    'cupy_scatter_min', 'atomicMin(&out0, in1)')
cdef _scatter_and_kernel = _create_scatter_kernel(
    'cupy_scatter_and', 'atomicAnd(&out0, in1)')
cdef _scatter_or_kernel = _create_scatter_kernel(
    'cupy_scatter_or', 'atomicOr(&out0, in1)')
cdef _scatter_xor_kernel = _create_scatter_kernel(
    'cupy_scatter_xor', 'atomicXor(&out0, in1)')
cdef _create_scatter_mask_kernel(name, code):
    """Build a boolean-mask scatter kernel parameterized by ``code``.

    ``mask_scanned`` is the inclusive prefix sum of the mask, so
    ``mask_scanned - 1`` indexes the compacted source ``v``.  No atomics are
    needed: each True mask position targets a distinct element of ``a``.
    """
    return ElementwiseKernel(
        'raw T v, bool mask, S mask_scanned',
        'T a',
        string.Template('''
        T &out0 = a;
        T &in0 = a;
        const T &in1 = v[mask_scanned - 1];
        if (mask) ${code};
        ''').substitute(code=code),
        name,
    )
# Mask-based scatter kernels, one per supported in-place op.
cdef _scatter_update_mask_kernel = _create_scatter_mask_kernel(
    'cupy_scatter_update_mask', 'out0 = in1')
cdef _scatter_add_mask_kernel = _create_scatter_mask_kernel(
    'cupy_scatter_add_mask', 'out0 = in0 + in1')
# Fix: this kernel was previously registered as 'cupy_scatter_add_mask',
# colliding with the addition kernel's CUDA function name above.
cdef _scatter_sub_mask_kernel = _create_scatter_mask_kernel(
    'cupy_scatter_sub_mask', 'out0 = in0 - in1')
cdef _scatter_max_mask_kernel = _create_scatter_mask_kernel(
    'cupy_scatter_max_mask', 'out0 = max(in0, in1)')
cdef _scatter_min_mask_kernel = _create_scatter_mask_kernel(
    'cupy_scatter_min_mask', 'out0 = min(in0, in1)')
cdef _scatter_and_mask_kernel = _create_scatter_mask_kernel(
    'cupy_scatter_and_mask', 'out0 = (in0 & in1)')
cdef _scatter_or_mask_kernel = _create_scatter_mask_kernel(
    'cupy_scatter_or_mask', 'out0 = (in0 | in1)')
cdef _scatter_xor_mask_kernel = _create_scatter_mask_kernel(
    'cupy_scatter_xor_mask', 'out0 = (in0 ^ in1)')
# Boolean-mask gather: compacts the selected elements of ``a`` into ``out``
# using the inclusive prefix sum of the mask as the destination index.
_getitem_mask_kernel = ElementwiseKernel(
    'T a, bool mask, S mask_scanned',
    'raw T out',
    'if (mask) out[mask_scanned - 1] = a',
    'cupy_getitem_mask')
cdef _check_mask_shape(_ndarray_base a, _ndarray_base mask, Py_ssize_t axis):
    # Validate that a boolean mask starting at ``axis`` matches the
    # corresponding dimensions of ``a`` (a 0-length mask dim is accepted).
    cdef Py_ssize_t i, a_sh, m_sh
    for i, m_sh in enumerate(mask._shape):
        a_sh = a._shape[axis + i]
        if m_sh not in (0, a_sh):
            raise IndexError(
                'boolean index did not match indexed array along dimension '
                f'{axis + i}; dimension is {a_sh} '
                f'but corresponding boolean dimension is {m_sh}'
            )
cpdef _prepare_mask_indexing_single(
        _ndarray_base a, _ndarray_base mask, Py_ssize_t axis):
    # Prepare a boolean mask for indexing ``a`` starting at ``axis``.
    # Returns (mask broadcast to a's shape, its inclusive prefix sum,
    # resulting masked shape).  Synchronizes to read the True count.
    cdef _ndarray_base mask_scanned, mask_br
    cdef int n_true
    cdef tuple lshape, rshape, a_shape
    cdef Py_ssize_t a_ndim, mask_ndim
    a_ndim = a._shape.size()
    mask_ndim = mask._shape.size()
    a_shape = a.shape
    lshape = a_shape[:axis]
    rshape = a_shape[axis + mask._shape.size():]
    if mask.size == 0:
        # Empty mask selects nothing; no scan needed.
        masked_shape = lshape + (0,) + rshape
        mask_br = _manipulation._reshape(mask, masked_shape)
        return mask_br, mask_br, masked_shape
    # Get number of True in the mask to determine the shape of the array
    # after masking.
    if mask.size <= 2 ** 31 - 1:
        mask_type = numpy.int32
    else:
        mask_type = numpy.int64
    op = _math.scan_op.SCAN_SUM
    # starts with 1
    mask_scanned = _math.scan(mask.ravel(), op=op, dtype=mask_type)
    n_true = int(mask_scanned[-1])  # synchronize!
    masked_shape = lshape + (n_true,) + rshape
    # When mask covers the entire array, broadcasting is not necessary.
    if mask_ndim == a_ndim and axis == 0:
        return (
            mask,
            _manipulation._reshape(mask_scanned, mask._shape),
            masked_shape)
    # Discard the un-broadcast scan; it is recomputed below on the
    # broadcast mask.
    mask_scanned = None
    # The scan of the broadcasted array is used to index on kernel.
    mask = _manipulation._reshape(
        mask,
        axis * (1,) + mask.shape + (a_ndim - axis - mask_ndim) * (1,))
    if <Py_ssize_t>mask._shape.size() > a_ndim:
        raise IndexError('too many indices for array')
    mask = _manipulation.broadcast_to(mask, a_shape)
    if mask.size <= 2 ** 31 - 1:
        mask_type = numpy.int32
    else:
        mask_type = numpy.int64
    mask_scanned = _manipulation._reshape(
        _math.scan(mask.ravel(), op=_math.scan_op.SCAN_SUM, dtype=mask_type),
        mask._shape)
    return mask, mask_scanned, masked_shape
cpdef _ndarray_base _getitem_mask_single(
        _ndarray_base a, _ndarray_base mask, int axis):
    # Gather the elements of ``a`` selected by a single boolean mask whose
    # dimensions start at ``axis``.
    cdef _ndarray_base mask_scanned
    cdef tuple masked_shape
    mask, mask_scanned, masked_shape = _prepare_mask_indexing_single(
        a, mask, axis)
    out = core.ndarray(masked_shape, dtype=a.dtype)
    if out.size == 0:
        # Nothing selected; skip the kernel launch.
        return out
    return _getitem_mask_kernel(a, mask, mask_scanned, out)
cdef _ndarray_base _take(
        _ndarray_base a, indices, int start, int stop, _ndarray_base out=None):
    # Take along (flattened) axes from start to stop.
    # When start + 1 == stop this function behaves similarly to np.take
    cdef tuple out_shape, indices_shape
    cdef int i, ndim = a._shape.size()
    cdef Py_ssize_t ldim, cdim, rdim, index_range
    assert start <= stop
    if numpy.isscalar(indices):
        indices_shape = ()
        cdim = 1
    else:
        if not isinstance(indices, _ndarray_base):
            indices = core.array(indices, dtype=int)
        indices_shape = indices.shape
        cdim = indices.size
    # ldim/rdim: products of the axes left/right of the taken span;
    # index_range: flattened extent of the taken span itself.
    ldim = rdim = 1
    if start == 0 and stop == ndim:
        # Whole-array (flattened) take.
        out_shape = indices_shape
        index_range = a.size
    else:
        a_shape = a.shape
        out_shape = a_shape[:start] + indices_shape + a_shape[stop:]
        if len(indices_shape) != 0:
            # Align the index array with the taken span for broadcasting.
            indices = _manipulation._reshape(
                indices,
                (1,) * start + indices_shape + (1,) * (ndim - stop))
        for i in range(start):
            ldim *= a._shape[i]
        for i in range(stop, ndim):
            rdim *= a._shape[i]
        index_range = 1
        for i in range(start, stop):
            index_range *= a._shape[i]
    if out is None:
        out = core.ndarray(out_shape, dtype=a.dtype)
    else:
        if out.dtype != a.dtype:
            raise TypeError('Output dtype mismatch')
        if out.shape != out_shape:
            raise ValueError('Output shape mismatch')
    if a.size == 0 and out.size != 0:
        raise IndexError('cannot do a non-empty take from an empty axes.')
    if isinstance(indices, _ndarray_base):
        return _take_kernel(
            a.reduced_view(), indices, ldim, cdim, rdim, index_range, out)
    else:
        # Scalar index path avoids materializing an index array.
        return _take_kernel_scalar(
            a.reduced_view(), indices, ldim, cdim, rdim, index_range, out)
cdef _scatter_op_single(
        _ndarray_base a, _ndarray_base indices, value, Py_ssize_t start,
        Py_ssize_t stop, op=''):
    # When op == 'update', this function behaves similarly to
    # a code below using NumPy under the condition that a = a._reshape(shape)
    # does not invoke copy.
    #
    # shape = a[:start] +\
    #     (numpy.prod(a[start:stop]),) + a[stop:]
    # a = a._reshape(shape)
    # slices = (slice(None),) * start + indices +\
    #     (slice(None),) * (a.ndim - stop)
    # a[slices] = value
    cdef Py_ssize_t adim, cdim, rdim
    cdef tuple a_shape, indices_shape, lshape, rshape, v_shape
    cdef _ndarray_base v
    if not isinstance(value, _ndarray_base):
        v = core.array(value, dtype=a.dtype)
    else:
        v = value.astype(a.dtype, copy=False)
    a_shape = a.shape
    lshape = a_shape[:start]
    rshape = a_shape[stop:]
    # adim: flattened extent of the scattered span; the kernels wrap
    # indices into [0, adim).
    adim = internal.prod_sequence(a_shape[start:stop])
    indices_shape = indices.shape
    v_shape = lshape + indices_shape + rshape
    v = _manipulation.broadcast_to(v, v_shape)
    cdim = indices.size
    rdim = internal.prod_sequence(rshape)
    # Align the index array with the scattered span for broadcasting.
    indices = _manipulation._reshape(
        indices,
        (1,) * len(lshape) + indices_shape + (1,) * len(rshape))
    indices = _manipulation.broadcast_to(indices, v_shape)
    if op == 'update':
        _scatter_update_kernel(
            v, indices, cdim, rdim, adim, a.reduced_view())
    elif op == 'add':
        # There is constraints on types because atomicAdd() in CUDA 7.5
        # only supports int32, uint32, uint64, and float32.
        if not issubclass(v.dtype.type,
                          (numpy.int32, numpy.float16, numpy.float32,
                           numpy.float64, numpy.uint32, numpy.uint64,
                           numpy.intc, numpy.uintc, numpy.ulonglong)):
            raise TypeError(
                'cupy.add.at only supports int32, float16, float32, float64, '
                'uint32, uint64, as data type')
        _scatter_add_kernel(
            v, indices, cdim, rdim, adim, a.reduced_view())
    elif op == 'sub':
        # atomicSub has a narrower supported type set than atomicAdd.
        if not issubclass(v.dtype.type,
                          (numpy.int32, numpy.uint32,
                           numpy.intc, numpy.uintc)):
            raise TypeError(
                'cupy.subtract.at only supports int32, uint32, as data type')
        _scatter_sub_kernel(
            v, indices, cdim, rdim, adim, a.reduced_view())
    elif op == 'max':
        if not issubclass(v.dtype.type,
                          (numpy.int32, numpy.float32, numpy.float64,
                           numpy.uint32, numpy.uint64,
                           numpy.intc, numpy.uintc, numpy.ulonglong)):
            raise TypeError(
                'cupy.maximum.at only supports int32, float32, float64, '
                'uint32, uint64 as data type')
        _scatter_max_kernel(
            v, indices, cdim, rdim, adim, a.reduced_view())
    elif op == 'min':
        if not issubclass(v.dtype.type,
                          (numpy.int32, numpy.float32, numpy.float64,
                           numpy.uint32, numpy.uint64,
                           numpy.intc, numpy.uintc, numpy.ulonglong)):
            raise TypeError(
                'cupy.minimum.at only supports int32, float32, float64, '
                'uint32, uint64 as data type')
        _scatter_min_kernel(
            v, indices, cdim, rdim, adim, a.reduced_view())
    elif op == 'and':
        if not issubclass(v.dtype.type,
                          (numpy.int32, numpy.int64,
                           numpy.uint32, numpy.uint64,
                           numpy.intc, numpy.uintc,
                           numpy.longlong, numpy.ulonglong)):
            raise TypeError(
                'cupy.bitwise_and.at only supports int32, int64, '
                'uint32, uint64 as data type')
        _scatter_and_kernel(
            v, indices, cdim, rdim, adim, a.reduced_view())
    elif op == 'or':
        if not issubclass(v.dtype.type,
                          (numpy.int32, numpy.int64,
                           numpy.uint32, numpy.uint64,
                           numpy.intc, numpy.uintc,
                           numpy.longlong, numpy.ulonglong)):
            raise TypeError(
                'cupy.bitwise_or.at only supports int32, int64, '
                'uint32, uint64 as data type')
        _scatter_or_kernel(
            v, indices, cdim, rdim, adim, a.reduced_view())
    elif op == 'xor':
        if not issubclass(v.dtype.type,
                          (numpy.int32, numpy.int64,
                           numpy.uint32, numpy.uint64,
                           numpy.intc, numpy.uintc,
                           numpy.longlong, numpy.ulonglong)):
            raise TypeError(
                'cupy.bitwise_xor.at only supports int32, int64, '
                'uint32, uint64 as data type')
        _scatter_xor_kernel(
            v, indices, cdim, rdim, adim, a.reduced_view())
    else:
        raise ValueError('provided op is not supported')
cdef _scatter_op_mask_single(
        _ndarray_base a, _ndarray_base mask, v, Py_ssize_t axis, op):
    # Scatter ``v`` into ``a`` at the positions selected by a boolean mask
    # whose dimensions start at ``axis``; ``op`` picks the in-place update.
    cdef _ndarray_base mask_scanned, src
    cdef tuple masked_shape
    mask, mask_scanned, masked_shape = _prepare_mask_indexing_single(
        a, mask, axis)
    if internal.prod(masked_shape) == 0:
        # Nothing selected: skip kernel launch entirely.
        return
    if isinstance(v, _ndarray_base):
        src = v
    else:
        src = core.array(v, dtype=a.dtype)
    # Cython's static resolution does not work because of omitted arguments
    src = (<object>src).astype(a.dtype, copy=False)
    # broadcast src to shape determined by the mask
    src = _manipulation.broadcast_to(src, masked_shape)
    kernel_table = {
        'update': _scatter_update_mask_kernel,
        'add': _scatter_add_mask_kernel,
        'sub': _scatter_sub_mask_kernel,
        'max': _scatter_max_mask_kernel,
        'min': _scatter_min_mask_kernel,
        'and': _scatter_and_mask_kernel,
        'or': _scatter_or_mask_kernel,
        'xor': _scatter_xor_mask_kernel,
    }
    kernel = kernel_table.get(op)
    if kernel is None:
        raise ValueError('provided op is not supported')
    kernel(src, mask, mask_scanned, a)
cdef _scatter_op(_ndarray_base a, slices, value, op):
    # Top-level scatter dispatch used by __setitem__ and the ufunc .at
    # methods: resolve basic indices into a view, then dispatch on the
    # remaining advanced indices (mask / index array / multiple arrays) or,
    # with no advanced indices, apply ``op`` over the whole view.
    cdef Py_ssize_t start, stop, axis
    cdef _ndarray_base x, y, reduced_idx
    cdef list slice_list
    slice_list = _prepare_slice_list(slices)
    a, adv = _view_getitem(a, slice_list)
    if adv is not None:
        axis = adv
        if len(slice_list) == 1:
            s = slice_list[0]
            if s.dtype.kind == 'b':
                _scatter_op_mask_single(a, s, value, axis, op)
            else:
                _scatter_op_single(a, s, value, axis, axis + 1, op)
        else:
            # scatter_op with multiple integer arrays
            reduced_idx, start, stop = _prepare_multiple_array_indexing(
                a, axis, slice_list)
            _scatter_op_single(a, reduced_idx, value, start, stop, op)
        return
    # Basic indexing only: operate on the view directly.
    y = a
    if op == 'update':
        if not isinstance(value, _ndarray_base):
            y.fill(value)
            return
        x = value
        if (internal.vector_equal(y._shape, x._shape) and
                internal.vector_equal(y._strides, x._strides)):
            if y.data.ptr == x.data.ptr:
                return  # Skip since x and y are the same array
            elif y._c_contiguous and x.dtype == y.dtype:
                # Matching layout: a raw async device copy suffices.
                y.data.copy_from_device_async(x.data, x.nbytes)
                return
        elementwise_copy(x, y)
        return
    if op == 'add':
        _math._add(y, value, y)
        return
    if op == 'sub':
        _math._subtract(y, value, y)
        return
    if op == 'max':
        cupy.maximum(y, value, y)
        return
    if op == 'min':
        cupy.minimum(y, value, y)
        return
    if op == 'and':
        cupy.bitwise_and(y, value, y)
        return
    if op == 'or':
        cupy.bitwise_or(y, value, y)
        return
    if op == 'xor':
        cupy.bitwise_xor(y, value, y)
        return
    raise ValueError('this op is not supported')
cdef _ndarray_base _diagonal(
        _ndarray_base a, Py_ssize_t offset=0, Py_ssize_t axis1=0,
        Py_ssize_t axis2=1):
    # Extract the ``offset``-th diagonal over (axis1, axis2) as a view,
    # by moving the two axes last and combining their strides.
    cdef Py_ssize_t ndim = a.ndim
    if not (-ndim <= axis1 < ndim and -ndim <= axis2 < ndim):
        raise numpy.AxisError(
            'axis1(={0}) and axis2(={1}) must be within range '
            '(ndim={2})'.format(axis1, axis2, ndim))
    axis1 %= ndim
    axis2 %= ndim
    if axis1 < axis2:
        min_axis, max_axis = axis1, axis2
    else:
        min_axis, max_axis = axis2, axis1
    tr = list(range(ndim))
    del tr[max_axis]
    del tr[min_axis]
    if offset >= 0:
        a = _manipulation._transpose(a, tr + [axis1, axis2])
    else:
        # Negative offset: swap the two axes and negate the offset so the
        # same sub-diagonal slicing below applies.
        a = _manipulation._transpose(a, tr + [axis2, axis1])
        offset = -offset
    diag_size = max(0, min(a.shape[-2], a.shape[-1] - offset))
    ret_shape = a.shape[:-2] + (diag_size,)
    if diag_size == 0:
        return core.ndarray(ret_shape, dtype=a.dtype)
    a = a[..., :diag_size, offset:offset + diag_size]
    ret = a.view()
    # TODO(niboshi): Confirm update_x_contiguity flags
    # Summing the last two strides walks the diagonal of the 2-d tail.
    ret._set_shape_and_strides(
        a.shape[:-2] + (diag_size,),
        a.strides[:-2] + (a.strides[-1] + a.strides[-2],),
        True, True)
    return ret
# Accumulates one axis' contribution into the flat index array ``out``:
# wraps each index into [0, len) (Python-style modulo via _floor_divide)
# and adds it scaled by the axis stride.
_prepare_array_indexing = ElementwiseKernel(
    'T s, S len, S stride',
    'S out',
    'S in0 = s, in1 = len;'
    'out += stride * (in0 - _floor_divide(in0, in1) * in1)',
    'cupy_prepare_array_indexing')
cdef tuple _prepare_multiple_array_indexing(
        _ndarray_base a, Py_ssize_t start, list slices
):
    # slices consist of ndarray
    # Collapse multiple (broadcastable) array indices starting at ``start``
    # into one flat int64 index array over the spanned axes.
    # Returns (reduced_idx, start, stop).
    cdef list indices = [], shapes = []  # int ndarrays
    cdef Py_ssize_t i, stop, stride
    cdef _ndarray_base reduced_idx, s
    for s in slices:
        if s.dtype.kind == 'b':
            # Expand a boolean mask into one integer index array per
            # masked axis (argwhere rows, transposed).  Synchronizes.
            s = _ndarray_argwhere(s).T
            indices.extend(s)
            shapes.append(s.shape[1:])
        else:
            indices.append(s)
            shapes.append(s.shape)
    stop = start + len(indices)
    # br = _manipulation.broadcast(*indices)
    # indices = list(br.values)
    reduced_idx = core.ndarray(
        internal._broadcast_shapes(shapes), dtype=numpy.int64)
    reduced_idx.fill(0)
    # Accumulate per-axis contributions right-to-left (row-major order).
    stride = 1
    i = stop
    for s in reversed(indices):
        i -= 1
        a_shape_i = a._shape[i]
        # wrap all out-of-bound indices
        if a_shape_i != 0:
            _prepare_array_indexing(s, a_shape_i, stride, reduced_idx)
        stride *= a_shape_i
    return reduced_idx, start, stop
cdef _ndarray_base _getitem_multiple(
        _ndarray_base a, Py_ssize_t start, list slices):
    # Advanced indexing with several array indices: collapse them into one
    # flat index array, then gather over the spanned axes with _take.
    flat_idx, first, last = _prepare_multiple_array_indexing(a, start, slices)
    return _take(a, flat_idx, first, last)
cdef _ndarray_base _add_reduceat(
        _ndarray_base array, indices, axis, dtype, out):
    # Implements add.reduceat via a cumulative sum: the segment sum
    # [indices[k], indices[k+1]) is acc[indices[k+1]] - acc[indices[k]].
    # Per NumPy semantics, a non-increasing pair yields array[indices[k]].
    from cupy._sorting import search
    axis = internal._normalize_axis_index(axis, array.ndim)
    # Append the axis length so the last segment runs to the end.
    indices = cupy.append(indices, array.shape[axis])
    shape = [1 if i == axis else dim for i, dim in enumerate(array.shape)]
    acc = array.cumsum(axis, dtype)
    # Prepend a zero row so acc is an exclusive prefix sum along axis.
    acc = cupy.append(cupy.zeros(shape, acc.dtype), acc, axis)
    # mask marks degenerate segments (indices[k] >= indices[k+1]).
    mask = indices[:-1] >= indices[1:]
    mask = mask.reshape(-1, *([1] * (array.ndim - axis - 1)))
    return search._where_ufunc(
        mask,
        array.take(indices[:-1], axis),
        acc.take(indices[1:], axis) - acc.take(indices[:-1], axis),
        out
    )
from cupy._core._carray cimport shape_t
from cupy._core.core cimport _ndarray_base
cpdef compute_type_to_str(compute_type)
cpdef get_compute_type(dtype)
cpdef _ndarray_base dot(_ndarray_base a, _ndarray_base b, _ndarray_base out=*)
cpdef _ndarray_base tensordot_core(
_ndarray_base a, _ndarray_base b, _ndarray_base out, Py_ssize_t n,
Py_ssize_t m, Py_ssize_t k, const shape_t& ret_shape)
cpdef _ndarray_base matmul(
_ndarray_base a, _ndarray_base b, _ndarray_base out=*)
# Compute-type constants used by get_compute_type/set_compute_type to
# control precision/accuracy trade-offs in matmul/dot routines.
cpdef enum:
    COMPUTE_TYPE_TBD = 0
    COMPUTE_TYPE_DEFAULT = 1   # default
    COMPUTE_TYPE_PEDANTIC = 2  # disable algorithmic optimizations
    COMPUTE_TYPE_FP16 = 3      # allow converting inputs to FP16
    COMPUTE_TYPE_FP32 = 4      # allow converting inputs to FP32
    COMPUTE_TYPE_FP64 = 5      # allow converting inputs to FP64
    COMPUTE_TYPE_BF16 = 6      # allow converting inputs to BF16
    COMPUTE_TYPE_TF32 = 7      # allow converting inputs to TF32
import math
import os
import warnings
import cython
import numpy
import cupy
from cupy._core._kernel import ElementwiseKernel
from cupy._core._reduction import ReductionKernel
from cupy._core._ufuncs import elementwise_copy
import cupy._core.core as core
from libc.stdint cimport intptr_t
from cupy._core cimport _accelerator
from cupy._core._carray cimport shape_t
from cupy._core._dtype cimport to_cuda_dtype
from cupy._core._scalar cimport get_typename
from cupy._core.core cimport _internal_ascontiguousarray
from cupy._core.core cimport _ndarray_init
from cupy._core.core cimport ascontiguousarray
from cupy._core.core cimport _ndarray_base
from cupy._core cimport _memory_range
from cupy._core cimport _routines_manipulation as _manipulation
from cupy._core cimport _routines_math as _math
from cupy.cuda cimport device
from cupy_backends.cuda.api cimport runtime
from cupy_backends.cuda.libs cimport cublas
# Minimal declarations of the CUDA complex scalar types used for GEMM
# alpha/beta coefficients; field layout mirrors the C header.
cdef extern from '../../cupy_backends/cupy_complex.h':
    ctypedef struct cuComplex 'cuComplex':
        float x, y
    ctypedef struct cuDoubleComplex 'cuDoubleComplex':
        double x, y
# Cached CUDA runtime version; -1 means "not yet queried via
# runtime.runtimeGetVersion()".
cdef int _cuda_runtime_version = -1
# Per dtype-class compute type, indexed by to_compute_type_index():
# 0 = float16, 1 = float32/complex64, 2 = float64/complex128.
cdef list compute_types = [COMPUTE_TYPE_TBD,  # float16
                           COMPUTE_TYPE_TBD,  # float32
                           COMPUTE_TYPE_TBD]  # float64
# Human-readable names for the COMPUTE_TYPE_* enum values.
cdef dict compute_type_str = {
    0: 'COMPUTE_TYPE_TBD',
    1: 'COMPUTE_TYPE_DEFAULT',
    2: 'COMPUTE_TYPE_PEDANTIC',
    3: 'COMPUTE_TYPE_FP16',
    4: 'COMPUTE_TYPE_FP32',
    5: 'COMPUTE_TYPE_FP64',
    6: 'COMPUTE_TYPE_BF16',
    7: 'COMPUTE_TYPE_TF32',
}
cpdef int to_compute_type_index(dtype) except -1:
    """Map ``dtype`` to its slot in ``compute_types``.

    Returns 0 for float16, 1 for float32/complex64, and 2 for
    float64/complex128; raises ``TypeError`` for anything else.
    """
    cdef str kind_char = numpy.dtype(dtype).char
    if kind_char == 'e':
        return 0
    if kind_char in 'fF':
        return 1
    if kind_char in 'dD':
        return 2
    raise TypeError('dtype is not supported: {}'.format(dtype))
cpdef set_compute_type(dtype, compute_type):
    """Set the compute type used for the dtype class of ``dtype``.

    BF16/TF32 require compute capability 8.0 or higher; on older GPUs a
    warning is emitted and the default compute type is stored instead.
    """
    global compute_types
    always_allowed = (COMPUTE_TYPE_TBD, COMPUTE_TYPE_DEFAULT,
                      COMPUTE_TYPE_PEDANTIC, COMPUTE_TYPE_FP16,
                      COMPUTE_TYPE_FP32, COMPUTE_TYPE_FP64)
    ampere_or_later = (COMPUTE_TYPE_BF16, COMPUTE_TYPE_TF32)
    slot = to_compute_type_index(dtype)
    if compute_type in always_allowed:
        compute_types[slot] = compute_type
    elif compute_type in ampere_or_later:
        if int(device.get_compute_capability()) >= 80:
            compute_types[slot] = compute_type
        else:
            warnings.warn('COMPUTE_TYPE_BF16 and COMPUTE_TYPE_TF32 are only '
                          'available on GPUs with compute capability 8.0 or '
                          'higher. COMPUTE_TYPE_DEFAULT will be used instead.')
            compute_types[slot] = COMPUTE_TYPE_DEFAULT
    else:
        raise ValueError('Unknown compute type: {}'.format(compute_type))
cpdef compute_type_to_str(compute_type):
    """Return the symbolic name for ``compute_type``.

    Unknown values are returned unchanged.
    """
    return compute_type_str.get(compute_type, compute_type)
def _tensordot_core_int_kernel_impl(config, dtype, code, name):
    """Compile and return an integer-GEMM kernel specialized for ``dtype``.

    ``code`` (a templated ``__global__`` wrapper calling
    ``_tensordot_core_int_kernel_impl``) is appended after the blocked
    GEMM device function below; each ``(key, value)`` in ``config``
    becomes a ``#define`` line prepended to the source.
    """
    # This code is based in the GEMM implementation from MAGMA
    # (http://icl.cs.utk.edu/magma/)
    code = '''
#define fetch(arr, col, m, n, bound) arr[min(n*col + m, bound)]
template<typename T>
__device__ void _tensordot_core_int_kernel_impl(
        int M, int N, int K,
        const T* A,
        const T* B,
        T * C)
{
    int idx = threadIdx.x;
    int idy = threadIdx.y;
    int idt = DIM_X * idy + idx;
    int idxA = idt % DIM_XA;
    int idyA = idt / DIM_XA;
    int idxB = idt % DIM_XB;
    int idyB = idt / DIM_XB;
    int blx = blockIdx.x;
    int bly = blockIdx.y;
    __shared__ T sA[BLK_K][BLK_M + 1];
    __shared__ T sB[BLK_N][BLK_K + 1];
    // registers for the innermost loop
    T rC[THR_N][THR_M];
    T rA[THR_M];
    T rB[THR_N];
    T ra[BLK_K / DIM_YA][BLK_M / DIM_XA];
    T rb[BLK_N / DIM_YB][BLK_K / DIM_XB];
    const T* offs_dA = A + blx * BLK_M + idyA * M + idxA;
    int boundA = (M * (K - 1) + M) - (blx * BLK_M + idyA * M + idxA) - 1;
    const T* offs_dB = B + bly * BLK_N * K + idyB * K + idxB;
    int boundB = (K * (N - 1) + K) - (bly * BLK_N * K + idyB * K + idxB) - 1;
    int m, n, k, kk;
    #pragma unroll
    for (n = 0; n < THR_N; n++) {
        #pragma unroll
        for (m = 0 ; m < THR_M; m++) {
            rC[n][m] = 0;
        }
    }
    // blockwise transpose to transpose load
    #pragma unroll
    for (n = 0; n < BLK_K; n += DIM_YA) {
        #pragma unroll
        for (m = 0; m < BLK_M; m += DIM_XA) {
            sA[n + idyA][m + idxA] = fetch(offs_dA, M, m, n, boundA);
        }
    }
    // blockwise transpose to transpose load
    #pragma unroll
    for (n = 0; n < BLK_N; n += DIM_YB) {
        #pragma unroll
        for (m = 0; m < BLK_K; m += DIM_XB) {
            sB[n + idyB][m + idxB] = fetch(offs_dB, K, m, n, boundB);
        }
    }
    __syncthreads();
    for (kk = 0; kk < K - BLK_K; kk += BLK_K)
    {
        offs_dA += BLK_K * M;
        boundA -= BLK_K * M;
        offs_dB += BLK_K;
        boundB -= BLK_K;
        #pragma unroll
        for (n = 0; n < BLK_K / DIM_YA; n++) {
            #pragma unroll
            for (m = 0; m < BLK_M / DIM_XA; m++) {
                ra[n][m] = fetch(offs_dA, M, m * DIM_XA, n * DIM_YA, boundA);
            }
        }
        #pragma unroll
        for (n = 0; n < BLK_N / DIM_YB; n++) {
            #pragma unroll
            for (m = 0; m < BLK_K / DIM_XB; m++) {
                rb[n][m] = fetch(offs_dB, K, m * DIM_XB, n * DIM_YB, boundB);
            }
        }
        // multiply
        #pragma unroll
        for (k = 0; k < BLK_K; k++)
        {
            #pragma unroll
            for (m = 0; m < THR_M; m++) {
                rA[m] = sA[k][m * DIM_X + idx];
            }
            #pragma unroll
            for (n = 0; n < THR_N; n++) {
                rB[n] = sB[n * DIM_Y + idy][k];
            }
            // HIP is strange...
            #ifdef __HIP_DEVICE_COMPILE__
            __syncthreads();
            #endif
            #pragma unroll
            for (n = 0; n < THR_N; n++) {
                #pragma unroll
                for (m = 0; m < THR_M; m++) {
                    rC[n][m] += rA[m] * rB[n];
                }
            }
        }
        __syncthreads();
        // store A regs->smem
        #pragma unroll
        for (n = 0; n < BLK_K / DIM_YA; n++)
        {
            #pragma unroll
            for (m = 0; m < BLK_M / DIM_XA; m++)
            {
                sA[n * DIM_YA + idyA][m * DIM_XA + idxA] = ra[n][m];
            }
        }
        #pragma unroll
        for (n = 0; n < BLK_N / DIM_YB; n++)
        {
            #pragma unroll
            for (m = 0; m < BLK_K / DIM_XB; m++)
            {
                sB[n * DIM_YB + idyB][m * DIM_XB + idxB] = rb[n][m];
            }
        }
        __syncthreads();
    }
    // Multiply last full (BLK_K) or partial block of columns of A and
    // rows of B.
    // It's okay that m,n exceed matrix bounds as all work is in registers
    // or shared memory, and out-of-bounds rC[n][m] will not be saved later.
    kk = K - kk;
    #pragma unroll
    for (k = 0; k < kk; k++)
    {
        #pragma unroll
        for (m = 0; m < THR_M; m++) {
            rA[m] = sA[k][m * DIM_X + idx];
        }
        #pragma unroll
        for (n = 0; n < THR_N; n++) {
            rB[n] = sB[n * DIM_Y + idy][k];
        }
        // HIP is strange...
        #ifdef __HIP_DEVICE_COMPILE__
        __syncthreads();
        #endif
        #pragma unroll
        for (n = 0; n < THR_N; n++) {
            #pragma unroll
            for (m = 0; m < THR_M; m++) {
                rC[n][m] += rA[m] * rB[n];
            }
        }
    }
    #pragma unroll
    for (n = 0; n < THR_N; n++) {
        int coord_dCn = bly * BLK_N + n * DIM_Y + idy;
        #pragma unroll
        for (m = 0; m < THR_M; m++) {
            int coord_dCm = blx * BLK_M + m * DIM_X + idx;
            if (coord_dCm < M && coord_dCn < N) {
                C[coord_dCn * M + coord_dCm] = rC[n][m];
            }
        }
    }
}
''' + code
    # Turn the tuning parameters into #define lines at the top of the
    # source so the template constants are fixed at compile time.
    for k, v in config:
        code = '#define ' + k + ' ' + str(v) + '\n' + code
    # Instantiate the template for every integer-like dtype so any of
    # them can be fetched from the compiled module.
    name_expressions = [f'{name}<bool>',
                        f'{name}<signed char>',
                        f'{name}<unsigned char>',
                        f'{name}<short>',
                        f'{name}<unsigned short>',
                        f'{name}<int>',
                        f'{name}<unsigned int>',
                        f'{name}<long>',
                        f'{name}<unsigned long>',
                        f'{name}<long long>',
                        f'{name}<unsigned long long>']
    mod = cupy.RawModule(code=code, options=('--std=c++11',),
                         name_expressions=name_expressions)
    ker = mod.get_function(name + '<' + get_typename(dtype) + '>')
    return ker
@cupy._util.memoize(for_each_device=True)
def _tensordot_core_int_kernel(config, dtype):
    """Return the single (non-batched) integer GEMM kernel for ``dtype``."""
    code = '''
template<typename T>
__global__ void _tensordot_core_int_kernel(
        int M, int N, int K,
        const T* A,
        const T* B,
        T * C)
{
    _tensordot_core_int_kernel_impl(M, N, K, A, B, C);
}
'''
    name = '_tensordot_core_int_kernel'
    return _tensordot_core_int_kernel_impl(config, dtype, code, name)
@cupy._util.memoize(for_each_device=True)
def _tensordot_core_int_batched_kernel(config, dtype):
    """Return the pointer-array batched integer GEMM kernel for ``dtype``.

    One GEMM per ``blockIdx.z``; operand addresses come from arrays of
    per-matrix device pointers (see ``_mat_ptrs``).
    """
    code = '''
template<typename T>
__global__ void _tensordot_core_int_batched_kernel(
        int M, int N, int K,
        const T* A[], const T* B[],
        T* C[])
{
    int batchid = blockIdx.z;
    _tensordot_core_int_kernel_impl(
        M, N, K, A[batchid], B[batchid], C[batchid]
    );
}
'''
    name = '_tensordot_core_int_batched_kernel'
    return _tensordot_core_int_kernel_impl(config, dtype, code, name)
@cupy._util.memoize(for_each_device=True)
def _tensordot_core_int_strided_batched_kernel(config, dtype):
    """Return the strided batched integer GEMM kernel for ``dtype``.

    One GEMM per ``blockIdx.z``; operand addresses are computed from a
    base pointer plus ``batchid * stride`` (no pointer arrays needed).
    """
    code = '''
template<typename T>
__global__ void _tensordot_core_int_strided_batched_kernel(
        int M, int N, int K,
        const T* A, long long strideA,
        const T* B, long long strideB,
        T * C, long long strideC)
{
    int batchid = blockIdx.z;
    _tensordot_core_int_kernel_impl(
        M, N, K,
        &A[batchid * strideA],
        &B[batchid * strideB],
        &C[batchid * strideC]
    );
}
'''
    name = '_tensordot_core_int_strided_batched_kernel'
    return _tensordot_core_int_kernel_impl(config, dtype, code, name)
cdef tuple _integral_tensordot_core_config():
    """Return ``(config, dim_x, dim_y, blk_m, blk_n)`` for the integer
    GEMM kernels: the #define key/value pairs plus the launch geometry
    the callers need.
    """
    # TODO(leofang): autotune the tuning parameters here? See the discussion
    # in this thread: https://groups.google.com/a/icl.utk.edu/g/magma-user/c/igc66uduTfI # NOQA
    dim_x, dim_y = 16, 16
    blk_m, blk_n, blk_k = 128, 128, 2
    dim_xa, dim_ya = 128, 2
    dim_xb, dim_yb = 2, 128
    config = (
        ('DIM_X', dim_x), ('DIM_Y', dim_y),
        ('BLK_M', blk_m), ('BLK_N', blk_n), ('BLK_K', blk_k),
        ('DIM_XA', dim_xa), ('DIM_YA', dim_ya),
        ('DIM_XB', dim_xb), ('DIM_YB', dim_yb),
        ('THR_M', blk_m // dim_x), ('THR_N', blk_n // dim_y),
    )
    return config, dim_x, dim_y, blk_m, blk_n
cdef _ndarray_base _integral_tensordot_core(
        _ndarray_base a, _ndarray_base b, _ndarray_base out, Py_ssize_t m,
        Py_ssize_t n, Py_ssize_t k, str dtype, const shape_t& ret_shape):
    # Single GEMM for dtypes the cuBLAS path does not handle (dispatch in
    # tensordot_core routes non-'efdFD' dtypes here).  Writes into `out`
    # and returns it.
    config, dim_x, dim_y, blk_m, blk_n = _integral_tensordot_core_config()
    kern = _tensordot_core_int_kernel(config, dtype)
    args = (m, n, k, a, b, out)
    # One (blk_m x blk_n) output tile per block.
    grid = (int(math.ceil(m / blk_m)), int(math.ceil(n / blk_n)), 1)
    block = (dim_x, dim_y, 1)
    kern(grid, block, args=args)
    return out
cdef _ndarray_base _integral_tensordot_core_batched(
        _ndarray_base a, _ndarray_base b, _ndarray_base out, Py_ssize_t m,
        Py_ssize_t n, Py_ssize_t k, str dtype, Py_ssize_t batch_count):
    # Pointer-array batched GEMM for non-cuBLAS dtypes; used when
    # operands are broadcast so matrices are not evenly strided.
    config, dim_x, dim_y, blk_m, blk_n = _integral_tensordot_core_config()
    kern = _tensordot_core_int_batched_kernel(config, dtype)
    block = (dim_x, dim_y, 1)
    # Device arrays of per-matrix pointers, one entry per batch element.
    matPtrA = _mat_ptrs(a)
    matPtrB = _mat_ptrs(b)
    matPtrOut = _mat_ptrs(out)
    # Batches are mapped to grid.z, so launch in chunks of 65000
    # (NOTE(review): presumably to stay under the CUDA grid.z limit of
    # 65535 -- confirm).
    max_batch_count = 65000
    for i in range(0, batch_count, max_batch_count):
        ibatch = min(max_batch_count, batch_count - i)
        args = (
            m, n, k, matPtrA[i:i + ibatch], matPtrB[i:i + ibatch],
            matPtrOut[i:i + ibatch])
        grid = (int(math.ceil(m / blk_m)), int(math.ceil(n / blk_n)), ibatch)
        kern(grid, block, args=args)
    return out
cdef _ndarray_base _integral_tensordot_core_strided_batched(
        _ndarray_base a, _ndarray_base b, _ndarray_base out, Py_ssize_t m,
        Py_ssize_t n, Py_ssize_t k, str dtype, Py_ssize_t batch_count):
    # Strided batched GEMM for non-cuBLAS dtypes; usable when operands
    # are contiguous batches (no broadcasting), so each matrix sits a
    # fixed element stride after the previous one.
    config, dim_x, dim_y, blk_m, blk_n = _integral_tensordot_core_config()
    kern = _tensordot_core_int_strided_batched_kernel(config, dtype)
    block = (dim_x, dim_y, 1)
    # Collapse leading batch axes into one.
    a = a.reshape((-1,) + a.shape[-2:])
    b = b.reshape((-1,) + b.shape[-2:])
    out = out.reshape((-1,) + out.shape[-2:])
    strideA = _get_stride_for_strided_batched_gemm(a)
    strideB = _get_stride_for_strided_batched_gemm(b)
    strideOut = _get_stride_for_strided_batched_gemm(out)
    # Batches are mapped to grid.z, so launch in chunks of 65000
    # (NOTE(review): presumably to stay under the CUDA grid.z limit of
    # 65535 -- confirm).
    max_batch_count = 65000
    for i in range(0, batch_count, max_batch_count):
        ibatch = min(max_batch_count, batch_count - i)
        args = (
            m, n, k, a[i:i + ibatch], strideA, b[i:i + ibatch], strideB,
            out[i:i + ibatch], strideOut)
        grid = (int(math.ceil(m / blk_m)), int(math.ceil(n / blk_n)), ibatch)
        kern(grid, block, args=args)
    return out
# Elementwise product reduced by summation; fallback for the scalar
# (m == 1 and n == 1) output case of tensordot_core when no CUB/cuTENSOR
# accelerator is enabled.
cdef _tensordot_core_mul_sum = ReductionKernel(
    'S x, T y', 'U out',
    'static_cast<U>(x) * static_cast<U>(y)',
    'a + b', 'out = a', '0', '_tensordot_core_mul_sum')
cpdef get_compute_type(dtype):
    """Return the compute type for the dtype class of ``dtype``.

    Resolved lazily: while the slot is still TBD the default is chosen
    (honoring the ``CUPY_TF32`` environment variable for
    float32/complex64) and cached via ``set_compute_type``.
    """
    global compute_types
    cdef int index = to_compute_type_index(dtype)
    if compute_types[index] == COMPUTE_TYPE_TBD:
        compute_type = COMPUTE_TYPE_DEFAULT
        dtype_char = numpy.dtype(dtype).char
        if dtype_char in 'fF' and int(os.getenv('CUPY_TF32', '0')) > 0:
            compute_type = COMPUTE_TYPE_TF32
        set_compute_type(dtype, compute_type)
    return compute_types[index]
@cython.profile(False)
cpdef inline tuple _mat_to_cublas_contiguous(
        _ndarray_base a, Py_ssize_t trans):
    """Return ``(a, trans, ld)`` usable by cuBLAS (F-order) GEMM.

    An F-contiguous matrix is used as-is; otherwise ``a`` is made
    C-contiguous (copying if needed) and the transpose flag is flipped.
    """
    assert a.ndim == 2
    if a._f_contiguous:
        # builtin max function is not used for Cython 0.23
        lda = a._strides[1] // a.itemsize
        if lda < a._shape[0]:
            lda = a._shape[0]
        return a, trans, lda
    if not a._c_contiguous:
        a = a.copy()
    return a, 1 - trans, a._strides[0] // a.itemsize
cpdef _ndarray_base dot(
        _ndarray_base a, _ndarray_base b, _ndarray_base out=None):
    """Dot product of two arrays (implementation behind ``cupy.dot``).

    1-D operands are promoted to matrices, the contraction axes are
    rolled to the front so ``tensordot_core`` sees ``a`` as (k, n) and
    ``b`` as (k, m), and unit axes added for vectors are excluded from
    ``ret_shape``.
    """
    cdef Py_ssize_t a_ndim, b_ndim, a_axis, b_axis, n, m, k
    cdef bint input_a_is_vec, input_b_is_vec
    cdef shape_t ret_shape, shape
    a_ndim = a._shape.size()
    b_ndim = b._shape.size()
    if out is not None:
        if numpy.result_type(a.dtype, b.dtype) != out.dtype:
            raise ValueError('Not supported dtype combination.')
        if not out._c_contiguous:
            raise ValueError('Output array must be C-contiguous')
    # Scalar operands degenerate to elementwise multiplication.
    if a_ndim == 0 or b_ndim == 0:
        return _math._multiply(a, b, out=out)
    input_a_is_vec = a_ndim == 1
    input_b_is_vec = b_ndim == 1
    # Promote vectors: a -> (1, len), b -> (len, 1).
    if input_a_is_vec:
        shape.clear()
        shape.push_back(1)
        shape.push_back(a.size)
        a = _manipulation._reshape(a, shape)
        a_ndim = 2
    if input_b_is_vec:
        shape.clear()
        shape.push_back(b.size)
        shape.push_back(1)
        b = _manipulation._reshape(b, shape)
        b_ndim = 2
    # Contract a's last axis with b's second-to-last axis.
    a_axis = a_ndim - 1
    b_axis = b_ndim - 2
    if a._shape[a_axis] != b._shape[b_axis]:
        raise ValueError('Axis dimension mismatch')
    # Move the contraction axes to the front for tensordot_core.
    if a_axis:
        a = _manipulation.rollaxis(a, a_axis, 0)
    if b_axis:
        b = _manipulation.rollaxis(b, b_axis, 0)
    k = a._shape[0]
    if k != 0:
        m = b.size // k
        n = a.size // k
    else:
        # When k==0, the function must return a matrix filled with zero
        # like NumPy.
        m = 0
        n = 0
    # Result shape: remaining axes of a, then remaining axes of b;
    # axes that came from vector promotion are dropped.
    if not input_a_is_vec:
        ret_shape.insert(ret_shape.end(), a._shape.begin() + 1, a._shape.end())
    if not input_b_is_vec:
        ret_shape.insert(ret_shape.end(), b._shape.begin() + 1, b._shape.end())
    if out is not None:
        # TODO(kataoka): Make the condition strict
        if k != 0 and out.size != n * m:
            raise ValueError('Output array has an invalid size')
    return tensordot_core(a, b, out, n, m, k, ret_shape)
cpdef _ndarray_base tensordot_core(
        _ndarray_base a, _ndarray_base b, _ndarray_base out, Py_ssize_t n,
        Py_ssize_t m, Py_ssize_t k, const shape_t& ret_shape):
    """Core GEMM dispatcher: computes the (n, m) product of ``a`` (k, n)
    and ``b`` (k, m), reshaped to ``ret_shape``.

    Paths, in order: empty operands -> zero-filled result; scalar
    output -> reduction kernel; non-float dtypes -> custom integer GEMM;
    CUDA >= 11 -> cublasGemmEx (tensordot_core_v11); otherwise legacy
    per-dtype cuBLAS calls.  ``out``, if given, must be C-contiguous with
    the right shape; a temporary is used when it may alias an operand.
    """
    # out, if specified, must be C-contiguous and have correct shape.
    cdef shape_t shape
    cdef Py_ssize_t transa, transb, lda, ldb
    cdef intptr_t handle
    cdef _ndarray_base copy_to_out = None
    cdef str dtype = a.dtype.char
    cdef int compute_capability = int(device.get_compute_capability())
    if dtype != b.dtype.char:
        dtype = numpy.promote_types(dtype, b.dtype).char
    # Empty contraction: result is all zeros (NumPy semantics).
    if not a.size or not b.size:
        if out is None:
            out = _ndarray_init(cupy.ndarray, ret_shape, dtype, None)
        out.fill(0)
        return out
    if out is not None:
        assert out.flags.c_contiguous and out.dtype == dtype
    cdef int ace
    # Scalar output: a plain multiply-sum reduction is enough.
    if m == 1 and n == 1:
        if out is None:
            out = _ndarray_init(cupy.ndarray, ret_shape, dtype, None)
        c = _manipulation._reshape(out, ())
        for ace in _accelerator._routine_accelerators:
            # fast path using CUB or cuTENSOR
            if ace in (_accelerator.ACCELERATOR_CUB,
                       _accelerator.ACCELERATOR_CUTENSOR):
                (a.ravel() * b.ravel()).sum(out=c)
                break
        else:
            _tensordot_core_mul_sum(a.ravel(), b.ravel(), out=c)
        return out
    a = a.astype(dtype, order='K', casting=None, subok=None, copy=False)
    b = b.astype(dtype, order='K', casting=None, subok=None, copy=False)
    # It copies the operands if needed
    if a._shape.size() != 2 or a._shape[0] != k or a._shape[1] != n:
        shape.clear()
        shape.push_back(k)
        shape.push_back(n)
        a = _manipulation._reshape(a, shape)
    if b._shape.size() != 2 or b._shape[0] != k or b._shape[1] != m:
        shape.clear()
        shape.push_back(k)
        shape.push_back(m)
        b = _manipulation._reshape(b, shape)
    # Be careful that cuBLAS uses the FORTRAN-order matrix representation.
    # Matrix-Matrix product A^T * B
    # c is C-contiguous while cuBLAS assumes F-contiguous inputs, so we
    # compute C^T = B^T * A here.
    a, transa, lda = _mat_to_cublas_contiguous(a, 0)
    b, transb, ldb = _mat_to_cublas_contiguous(b, 1)
    if out is None:
        out = c = _ndarray_init(cupy.ndarray, ret_shape, dtype, None)
    elif (
        _memory_range.may_share_bounds(out, a)
        or _memory_range.may_share_bounds(out, b)
    ):
        # GEMM must not write into memory it is reading from; compute
        # into a temporary and copy back at the end.
        copy_to_out = c = _ndarray_init(cupy.ndarray, ret_shape, dtype, None)
    else:
        c = out
    # Give c a 2-D (n, m) view for the GEMM call.
    if c._shape.size() != 2 or c._shape[0] != n or c._shape[1] != m:
        c = c.view()
        c.shape = (n, m)
    # Non-float dtypes (integer/bool): use the custom kernel; transposes
    # must be materialized because the kernel takes no trans flags.
    if dtype not in 'efdFD':
        if transa:
            a = a.T
            a = _internal_ascontiguousarray(a)
        if transb:
            b = _internal_ascontiguousarray(b)
        _integral_tensordot_core(b, a, c, m, n, k, dtype, ret_shape)
        if copy_to_out is not None:
            elementwise_copy(copy_to_out, out)
        return out
    global _cuda_runtime_version
    if _cuda_runtime_version < 0:
        _cuda_runtime_version = runtime.runtimeGetVersion()
    # Preferred path on CUDA 11+: cublasGemmEx with compute-type control.
    if (
        not runtime._is_hip_environment and
        _cuda_runtime_version >= 11000 and
        compute_capability >= 50
    ):
        tensordot_core_v11(transb, transa, m, n, k, b, ldb, a, lda, c, m)
        if copy_to_out is not None:
            elementwise_copy(copy_to_out, out)
        return out
    # Legacy per-dtype cuBLAS path.
    handle = device.get_cublas_handle()
    if dtype == 'e':
        coef_dtype = 'f'
    else:
        coef_dtype = dtype
    one = numpy.array(1.0, dtype=coef_dtype)
    zero = numpy.array(0.0, dtype=coef_dtype)
    if runtime._is_hip_environment and dtype == 'e':
        # On HIP, SgemmEx does not work for half precision
        dtype = 'f'
        a = a.astype(dtype, order='K', casting=None, subok=None, copy=True)
        b = b.astype(dtype, order='K', casting=None, subok=None, copy=True)
        c = _ndarray_init(cupy.ndarray, ret_shape, dtype, None)
        copy_to_out = c
        warnings.warn('On ROCm/HIP, there is no specialized API to handle '
                      'half precision floating numbers, so the computation '
                      'will be done by casting to single precision')
    if dtype == 'e':
        use_tensor_core = (not runtime._is_hip_environment and
                           _cuda_runtime_version >= 9000 and
                           compute_capability >= 70)
        if use_tensor_core:
            cublas.setMathMode(handle, cublas.CUBLAS_TENSOR_OP_MATH)
            cublas.gemmEx(
                handle, <int>transb, <int> transa, <int>m, <int>n, <int>k,
                one.ctypes.data, b.data.ptr, runtime.CUDA_R_16F, <int>ldb,
                a.data.ptr, runtime.CUDA_R_16F, <int>lda, zero.ctypes.data,
                c.data.ptr, runtime.CUDA_R_16F, <int>m, runtime.CUDA_R_32F,
                cublas.CUBLAS_GEMM_DEFAULT_TENSOR_OP)
            cublas.setMathMode(handle, cublas.CUBLAS_DEFAULT_MATH)
        else:
            cublas.sgemmEx(
                handle, <int>transb, <int> transa, <int>m, <int>n, <int>k,
                one.ctypes.data, b.data.ptr, runtime.CUDA_R_16F, <int>ldb,
                a.data.ptr, runtime.CUDA_R_16F, <int>lda, zero.ctypes.data,
                c.data.ptr, runtime.CUDA_R_16F, <int>m)
    elif dtype == 'f':
        cublas.sgemmEx(
            handle, <int>transb, <int> transa, <int>m, <int>n, <int>k,
            one.ctypes.data, b.data.ptr, runtime.CUDA_R_32F, <int>ldb,
            a.data.ptr, runtime.CUDA_R_32F, <int>lda, zero.ctypes.data,
            c.data.ptr, runtime.CUDA_R_32F, <int>m)
    elif dtype == 'd':
        cublas.dgemm(
            handle, <int>transb, <int>transa, <int>m, <int>n, <int>k,
            one.ctypes.data, b.data.ptr, <int>ldb, a.data.ptr, <int>lda,
            zero.ctypes.data, c.data.ptr, <int>m)
    elif dtype == 'F':
        cublas.cgemm(
            handle, <int>transb, <int>transa, <int>m, <int>n, <int>k,
            one.ctypes.data, b.data.ptr, <int>ldb, a.data.ptr, <int>lda,
            zero.ctypes.data, c.data.ptr, <int>m)
    elif dtype == 'D':
        cublas.zgemm(
            handle, <int>transb, <int>transa, <int>m, <int>n, <int>k,
            one.ctypes.data, b.data.ptr, <int>ldb, a.data.ptr, <int>lda,
            zero.ctypes.data, c.data.ptr, <int>m)
    else:
        raise ValueError('Invalid dtype: %s' % str(dtype))
    if copy_to_out is not None:
        elementwise_copy(copy_to_out, out)
    return out
cpdef _ndarray_base tensordot_core_v11(
        Py_ssize_t transa, Py_ssize_t transb, Py_ssize_t m, Py_ssize_t n,
        Py_ssize_t k, _ndarray_base a, Py_ssize_t lda, _ndarray_base b,
        Py_ssize_t ldb, _ndarray_base c, Py_ssize_t ldc):
    """GEMM via ``cublasGemmEx`` (CUDA 11+), honoring the configured
    compute type for ``c.dtype``.

    Writes the product into ``c`` in place.  NOTE(review): despite the
    declared ``_ndarray_base`` return type, no value is returned (the
    function falls off the end, i.e. returns None).
    """
    cdef float one_f, zero_f
    cdef double one_d, zero_d
    cdef cuComplex one_F, zero_F
    cdef cuDoubleComplex one_D, zero_D
    cdef size_t one_ptr, zero_ptr
    cdef int compute_capability = int(device.get_compute_capability())
    cdef int compute_type = get_compute_type(c.dtype)
    cdef int cublas_compute_type = -1
    # Choose the cuBLAS compute type from the dtype class and the
    # user-configured compute type.
    if c.dtype.char in 'efF':
        if compute_type == COMPUTE_TYPE_PEDANTIC:
            cublas_compute_type = cublas.CUBLAS_COMPUTE_32F_PEDANTIC
        elif compute_type == COMPUTE_TYPE_TF32 and c.dtype.char in 'fF':
            cublas_compute_type = cublas.CUBLAS_COMPUTE_32F_FAST_TF32
        else:
            cublas_compute_type = cublas.CUBLAS_COMPUTE_32F
    elif c.dtype.char in 'dD':
        if compute_type == COMPUTE_TYPE_PEDANTIC:
            cublas_compute_type = cublas.CUBLAS_COMPUTE_64F_PEDANTIC
        else:
            cublas_compute_type = cublas.CUBLAS_COMPUTE_64F
    else:
        raise ValueError('Invalid dtype: {}'.format(c.dtype))
    cdef int algo = cublas.CUBLAS_GEMM_DEFAULT
    if ((compute_capability >= 80) or
            (compute_capability >= 70 and c.dtype == 'e')):
        algo = cublas.CUBLAS_GEMM_DEFAULT_TENSOR_OP
    # alpha/beta must match the compute type's scalar width, and are
    # passed by host pointer.
    if cublas_compute_type in (cublas.CUBLAS_COMPUTE_32F,
                               cublas.CUBLAS_COMPUTE_32F_PEDANTIC,
                               cublas.CUBLAS_COMPUTE_32F_FAST_TF32):
        if c.dtype.char in 'efd':
            one_f = 1
            zero_f = 0
            one_ptr = <size_t>&one_f
            zero_ptr = <size_t>&zero_f
        else:
            one_F = cuComplex(1, 0)
            zero_F = cuComplex(0, 0)
            one_ptr = <size_t>&one_F
            zero_ptr = <size_t>&zero_F
    elif cublas_compute_type in (cublas.CUBLAS_COMPUTE_64F,
                                 cublas.CUBLAS_COMPUTE_64F_PEDANTIC):
        if c.dtype.char in 'efd':
            one_d = 1
            zero_d = 0
            one_ptr = <size_t>&one_d
            zero_ptr = <size_t>&zero_d
        else:
            one_D = cuDoubleComplex(1, 0)
            zero_D = cuDoubleComplex(0, 0)
            one_ptr = <size_t>&one_D
            zero_ptr = <size_t>&zero_D
    else:
        raise ValueError('Invalid cublas compute type: {}'
                         .format(cublas_compute_type))
    cdef int a_cuda_dtype = to_cuda_dtype(a.dtype, is_half_allowed=True)
    cdef int b_cuda_dtype = to_cuda_dtype(b.dtype, is_half_allowed=True)
    cdef int c_cuda_dtype = to_cuda_dtype(c.dtype, is_half_allowed=True)
    cdef intptr_t handle = device.get_cublas_handle()
    cublas.gemmEx(
        handle, <int>transa, <int>transb, <int>m, <int>n, <int>k, one_ptr,
        a.data.ptr, a_cuda_dtype, <int>lda, b.data.ptr, b_cuda_dtype, <int>ldb,
        zero_ptr, c.data.ptr, c_cuda_dtype, <int>ldc, cublas_compute_type,
        algo)
cdef Py_ssize_t _get_stride_for_strided_batched_gemm(
        _ndarray_base a) except? 0:
    # Element stride between consecutive matrices of a C-contiguous
    # batch: the element count of one trailing (rows, cols) matrix.
    cdef int ndim = a._shape.size()
    assert ndim > 2
    assert a._c_contiguous
    return a._shape[ndim - 2] * a._shape[ndim - 1]
# Computes base + (last-axis index) * stride per output element; used by
# _mat_ptrs to expand a base device pointer into per-matrix pointers.
cdef _mat_ptrs_kernel = ElementwiseKernel(
    'T base, T stride', 'T out',
    'out = base + _ind.get()[_ind.ndim - 1] * stride', 'cupy_mat_ptrs',
    reduce_dims=False)
cpdef _ndarray_base _mat_ptrs(_ndarray_base a):
    """Creates an array of pointers to matrices

    Args:
        a: A batch of matrices on GPU.
            shape: (A, B, C) -> A ptrs to mat of size (B, C)
            shape: (A_1, ..., A_N, B, C) -> A_1*...*A_N ptrs to mat of
                   size (B, C)

    Returns:
        GPU array (dtype uintp) of device pointers, one per matrix.
    """
    cdef int ndim = a._shape.size()
    assert ndim > 2
    cdef _ndarray_base idx
    # Pointers along the first batch axis: data.ptr + j * strides[0].
    idx = _mat_ptrs_kernel(
        a.data.ptr, a._strides[0],
        core.ndarray((a._shape[0],), dtype=numpy.uintp))
    # Fold in each remaining batch axis: every existing pointer fans out
    # into shape[i] pointers offset by strides[i].
    for i in range(1, ndim - 2):
        idx = _mat_ptrs_kernel(
            idx[:, None], a._strides[i],
            core.ndarray((idx.size, a._shape[i]), dtype=numpy.uintp))
        idx = idx.ravel()
    return idx
cpdef _ndarray_base matmul(
        _ndarray_base a, _ndarray_base b, _ndarray_base out=None):
    """Matrix product of two arrays.

    Returns the matrix product of two arrays and is the implementation of
    the `@` operator introduced in Python 3.5 following PEP465.

    The main difference against cupy.dot are the handling of arrays with
    more than 2 dimensions. For more information see :func:`numpy.matmul`.

    Args:
        a (cupy.ndarray): The left argument.
        b (cupy.ndarray): The right argument.
        out (cupy.ndarray): Output array.

    Returns:
        cupy.ndarray: Output array.

    .. seealso:: :func:`numpy.matmul`
    """
    cdef Py_ssize_t i, n, m, ka, kb, a_sh, b_sh, c_sh, ldc
    cdef Py_ssize_t batchCount, a_part_outshape, b_part_outshape
    cdef int orig_a_ndim, orig_b_ndim, a_ndim, b_ndim, ndim
    cdef _ndarray_base ap, bp, cp, c_view
    cdef bint use_broadcast
    orig_a_ndim = a._shape.size()
    orig_b_ndim = b._shape.size()
    if orig_a_ndim == 0 or orig_b_ndim == 0:
        raise ValueError('Scalar operands are not allowed, use \'*\' instead')
    ndim = max(orig_a_ndim, orig_b_ndim)
    # <= 2-D matmul is exactly dot(); add a temporary when `out` cannot
    # receive the result directly.
    if ndim <= 2:
        if out is None:
            return dot(a, b, out)
        ret_dtype = numpy.promote_types(a.dtype, b.dtype)
        if out._c_contiguous and ret_dtype == out.dtype:
            return dot(a, b, out)
        c = _ndarray_init(cupy.ndarray, out._shape, dtype=ret_dtype, obj=None)
        dot(a, b, c)
        elementwise_copy(c, out)
        return out
    orig_a = a
    orig_b = b
    # Promote 1-D operands to matrices; their unit axis is excluded from
    # the output shape below.
    a_part_outshape = b_part_outshape = 0
    if orig_a_ndim == 1:
        a = _manipulation._reshape(a, (1, a.size))
    else:
        a = a.view()
        a_part_outshape = a._shape[orig_a_ndim - 2]
    if orig_b_ndim == 1:
        b = _manipulation._reshape(b, (b.size, 1))
        ldc = 1
    else:
        b = b.view()
        b_part_outshape = ldc = b._shape[orig_b_ndim - 1]
    # expand dims
    a_ndim = a._shape.size()
    b_ndim = b._shape.size()
    if a_ndim < ndim:
        # TODO(niboshi): Confirm update_x_contiguity flags
        a._set_shape_and_strides(
            (1,) * (ndim - a_ndim) + a.shape,
            (0,) * (ndim - a_ndim) + a.strides,
            True, True)
    if b_ndim < ndim:
        # TODO(niboshi): Confirm update_x_contiguity flags
        b._set_shape_and_strides(
            (1,) * (ndim - b_ndim) + b.shape,
            (0,) * (ndim - b_ndim) + b.strides,
            True, True)
    ret_dtype = numpy.promote_types(a.dtype, b.dtype)
    dtype = ret_dtype
    # fp16 is computed in fp32; the result is cast back on copy-out.
    if dtype.char == 'e':
        dtype = numpy.dtype('f')
    a = ascontiguousarray(a, dtype)
    b = ascontiguousarray(b, dtype)
    # broadcast: align batch axes by setting stride 0 on size-1 axes.
    batchCount = 1  # batchCount = numpy.prod(out_shape[:-2])
    out_shape = []
    use_broadcast = False
    for i in range(0, ndim - 2):
        a_sh = a._shape[i]
        b_sh = b._shape[i]
        if a_sh != b_sh and a_sh != 1 and b_sh != 1:
            raise ValueError(
                'operands could not be broadcast together with '
                'remapped shapes')
        if a_sh == 0 or b_sh == 0:
            c_sh = 0
        else:
            c_sh = max(a_sh, b_sh)
        batchCount *= c_sh
        out_shape.append(c_sh)
        if a_sh == 1 and c_sh > 1:
            a._strides[i] = 0
            a._shape[i] = c_sh
            a._c_contiguous = a._f_contiguous = False
            use_broadcast = True
        if b_sh == 1 and c_sh > 1:
            b._strides[i] = 0
            b._shape[i] = c_sh
            b._c_contiguous = b._f_contiguous = False
            use_broadcast = True
    if orig_a_ndim != 1:
        out_shape.append(a_part_outshape)
    if orig_b_ndim != 1:
        out_shape.append(b_part_outshape)
    # (A B)^T = B^T A^T
    a, b = b, a
    ka = a._shape[ndim - 2]
    lda = n = a._shape[ndim - 1]
    m = b._shape[ndim - 2]
    ldb = kb = b._shape[ndim - 1]
    if ka != kb:
        raise ValueError(
            'shapes ({}) and ({}) not aligned'.format(
                ','.join([str(_) for _ in orig_a.shape]),
                ','.join([str(_) for _ in orig_b.shape])))
    if out is not None and out.shape != tuple(out_shape):
        raise ValueError('Output array has an invalid size')
    # Empty operands: the result is all zeros.
    if a.size == 0 or b.size == 0:
        if out is None:
            return cupy.zeros(out_shape, ret_dtype)
        else:
            out.fill(0)
            return out
    # Use `out` directly only when it is safe: right dtype, contiguous,
    # and not aliasing an operand.
    if (
        out is not None and out.dtype == dtype and out.flags.c_contiguous
        and not _memory_range.may_share_bounds(out, a)
        and not _memory_range.may_share_bounds(out, b)
    ):
        c = out
    else:
        c = core.ndarray(out_shape, dtype=dtype)
        if out is None:
            if dtype == ret_dtype:
                out = c
            else:
                out = core.ndarray(out_shape, dtype=ret_dtype)
    # Re-insert the unit axes dropped for 1-D operands so c_view has
    # full batch-matrix shape for the GEMM calls.
    if orig_a_ndim == 1 or orig_b_ndim == 1:
        c_view = c.view()
        if orig_b_ndim == 1:
            c_view._shape.push_back(1)
            c_view._strides.push_back(0)
        if orig_a_ndim == 1:
            c_view._shape.insert(c_view._shape.end() - 1, 1)
            c_view._strides.insert(c_view._strides.end() - 1, 0)
        assert c_view._c_contiguous
        c_view._update_f_contiguity()
    else:
        c_view = c
    # Non-float dtypes: custom integer GEMM kernels (strided variant
    # when no broadcasting was applied).
    if dtype.char not in 'efdFD':
        if not use_broadcast:
            _integral_tensordot_core_strided_batched(
                a, b, c_view, n, m, ka, dtype.char, batchCount)
        else:
            _integral_tensordot_core_batched(
                a, b, c_view, n, m, ka, dtype.char, batchCount)
        if out is not c:
            elementwise_copy(c, out)
        return out
    global _cuda_runtime_version
    if _cuda_runtime_version < 0:
        _cuda_runtime_version = runtime.runtimeGetVersion()
    cdef intptr_t handle = device.get_cublas_handle()
    cdef int cuda_dtype = to_cuda_dtype(dtype)
    cdef int algo = cublas.CUBLAS_GEMM_DEFAULT
    one = numpy.array(1, dtype=dtype)
    zero = numpy.array(0, dtype=dtype)
    if not use_broadcast:
        # Evenly strided batches: strided-batched GEMM.
        strideA = _get_stride_for_strided_batched_gemm(a)
        strideB = _get_stride_for_strided_batched_gemm(b)
        strideC = _get_stride_for_strided_batched_gemm(c_view)
        if dtype.char in 'fFdD':
            cublas.gemmStridedBatchedEx(
                handle,
                0,  # transa
                0,  # transb
                n, m, ka, one.ctypes.data,
                a.data.ptr, cuda_dtype, lda, strideA,
                b.data.ptr, cuda_dtype, ldb, strideB,
                zero.ctypes.data,
                c_view.data.ptr, cuda_dtype, ldc, strideC,
                batchCount, cuda_dtype, algo)
        else:
            raise TypeError(dtype, a.dtype, b.dtype)
    else:
        # Broadcast batches: pointer-array batched GEMM.
        ap = _mat_ptrs(a)
        bp = _mat_ptrs(b)
        cp = _mat_ptrs(c_view)
        if dtype == numpy.float32:
            cublas.sgemmBatched(
                handle,
                0,  # transa
                0,  # transb
                n, m, ka, one.ctypes.data,
                ap.data.ptr, lda,
                bp.data.ptr, ldb,
                zero.ctypes.data, cp.data.ptr, ldc, batchCount)
        elif dtype == numpy.float64:
            cublas.dgemmBatched(
                handle,
                0,  # transa
                0,  # transb
                n, m, ka, one.ctypes.data,
                ap.data.ptr, lda,
                bp.data.ptr, ldb,
                zero.ctypes.data, cp.data.ptr, ldc, batchCount)
        elif dtype == numpy.complex64:
            cublas.cgemmBatched(
                handle,
                0,  # transa
                0,  # transb
                n, m, ka, one.ctypes.data,
                ap.data.ptr, lda,
                bp.data.ptr, ldb,
                zero.ctypes.data, cp.data.ptr, ldc, batchCount)
        elif dtype == numpy.complex128:
            cublas.zgemmBatched(
                handle,
                0,  # transa
                0,  # transb
                n, m, ka, one.ctypes.data,
                ap.data.ptr, lda,
                bp.data.ptr, ldb,
                zero.ctypes.data, cp.data.ptr, ldc, batchCount)
        else:
            raise TypeError(dtype, a.dtype, b.dtype)
    if out is not c:
        elementwise_copy(c, out)
    return out
from cupy._core.core cimport _ndarray_base
cdef _ndarray_base _ndarray_all(_ndarray_base self, axis, out, keepdims)
cdef _ndarray_base _ndarray_any(_ndarray_base self, axis, out, keepdims)
cdef _ndarray_base _ndarray_greater(_ndarray_base self, other)
cdef _ndarray_base _ndarray_greater_equal(_ndarray_base self, other)
cdef _ndarray_base _ndarray_less(_ndarray_base self, other)
cdef _ndarray_base _ndarray_less_equal(_ndarray_base self, other)
cdef _ndarray_base _ndarray_equal(_ndarray_base self, other)
cdef _ndarray_base _ndarray_not_equal(_ndarray_base self, other)
from cupy._core._kernel import create_ufunc
from cupy._core._reduction import create_reduction_func
from cupy._core.core cimport _ndarray_base
cdef _ndarray_base _ndarray_all(_ndarray_base self, axis, out, keepdims):
    # ndarray.all(): delegates to the cupy_all reduction.
    return _all(self, axis=axis, out=out, keepdims=keepdims)
cdef _ndarray_base _ndarray_any(_ndarray_base self, axis, out, keepdims):
    # ndarray.any(): delegates to the cupy_any reduction.
    return _any(self, axis=axis, out=out, keepdims=keepdims)
cdef _ndarray_base _ndarray_greater(_ndarray_base self, other):
    # ndarray.__gt__(): delegates to the cupy_greater ufunc.
    return _greater(self, other)
cdef _ndarray_base _ndarray_greater_equal(_ndarray_base self, other):
    # ndarray.__ge__(): delegates to the cupy_greater_equal ufunc.
    return _greater_equal(self, other)
cdef _ndarray_base _ndarray_less(_ndarray_base self, other):
    # ndarray.__lt__(): delegates to the cupy_less ufunc.
    return _less(self, other)
cdef _ndarray_base _ndarray_less_equal(_ndarray_base self, other):
    # ndarray.__le__(): delegates to the cupy_less_equal ufunc.
    return _less_equal(self, other)
cdef _ndarray_base _ndarray_equal(_ndarray_base self, other):
    # ndarray.__eq__(): delegates to the cupy_equal ufunc.
    return _equal(self, other)
cdef _ndarray_base _ndarray_not_equal(_ndarray_base self, other):
    # ndarray.__ne__(): delegates to the cupy_not_equal ufunc.
    return _not_equal(self, other)
# Logical AND reduction over "element != 0"; identity is 'true'.
cdef _all = create_reduction_func(
    'cupy_all',
    ('?->?', 'B->?', 'h->?', 'H->?', 'i->?', 'I->?', 'l->?', 'L->?',
     'q->?', 'Q->?', 'e->?', 'f->?', 'd->?', 'F->?', 'D->?'),
    ('in0 != type_in0_raw(0)', 'a & b', 'out0 = a', 'bool'),
    'true', '')
# Logical OR reduction over "element != 0"; identity is 'false'.
cdef _any = create_reduction_func(
    'cupy_any',
    ('?->?', 'B->?', 'h->?', 'H->?', 'i->?', 'I->?', 'l->?', 'L->?',
     'q->?', 'Q->?', 'e->?', 'f->?', 'd->?', 'F->?', 'D->?'),
    ('in0 != type_in0_raw(0)', 'a | b', 'out0 = a', 'bool'),
    'false', '')
cpdef create_comparison(name, op, doc='', no_complex_dtype=True):
    """Create an elementwise comparison ufunc named ``cupy_<name>``.

    ``op`` is the C infix operator; complex input types are included
    only when ``no_complex_dtype`` is False.
    """
    ops = ('??->?', 'bb->?', 'BB->?', 'hh->?', 'HH->?', 'ii->?', 'II->?',
           'll->?', 'LL->?', 'qq->?', 'QQ->?', 'ee->?', 'ff->?', 'dd->?')
    if not no_complex_dtype:
        ops = ops + ('FF->?', 'DD->?')
    return create_ufunc(
        'cupy_' + name,
        ops,
        'out0 = in0 %s in1' % op,
        doc=doc)
# Comparison ufuncs backing the ndarray operators and the cupy
# namespace exports below.
# Fix: the not_equal docstring previously pointed at numpy.equal.
cdef _greater = create_comparison(
    'greater', '>',
    '''Tests elementwise if ``x1 > x2``.
    .. seealso:: :data:`numpy.greater`
    ''',
    no_complex_dtype=False)
cdef _greater_equal = create_comparison(
    'greater_equal', '>=',
    '''Tests elementwise if ``x1 >= x2``.
    .. seealso:: :data:`numpy.greater_equal`
    ''',
    no_complex_dtype=False)
cdef _less = create_comparison(
    'less', '<',
    '''Tests elementwise if ``x1 < x2``.
    .. seealso:: :data:`numpy.less`
    ''',
    no_complex_dtype=False)
cdef _less_equal = create_comparison(
    'less_equal', '<=',
    '''Tests elementwise if ``x1 <= x2``.
    .. seealso:: :data:`numpy.less_equal`
    ''',
    no_complex_dtype=False)
cdef _equal = create_comparison(
    'equal', '==',
    '''Tests elementwise if ``x1 == x2``.
    .. seealso:: :data:`numpy.equal`
    ''',
    no_complex_dtype=False)
cdef _not_equal = create_comparison(
    'not_equal', '!=',
    '''Tests elementwise if ``x1 != x2``.
    .. seealso:: :data:`numpy.not_equal`
    ''',
    no_complex_dtype=False)
# Variables to expose to Python
# (cythonized data cannot be exposed to Python, even with cpdef.)
# NOTE: `all` and `any` intentionally shadow the builtins in this
# module's namespace.
all = _all
any = _any
greater = _greater
greater_equal = _greater_equal
less = _less
less_equal = _less_equal
equal = _equal
not_equal = _not_equal
from libcpp cimport vector
from cupy._core._carray cimport shape_t
from cupy._core._carray cimport strides_t
from cupy._core.core cimport _ndarray_base
# --- Cython declarations (``.pxd``-style) for the manipulation routines ---
# These declare the C-level API of the shape-manipulation module; the
# implementations appear further below in this concatenated source.
cdef class broadcast:
    cdef:
        # Broadcasted views of the input arrays.
        readonly tuple values
        # The common broadcasted shape.
        readonly tuple shape
        # Total number of elements of ``shape``.
        readonly Py_ssize_t size
        # Number of dimensions of ``shape``.
        readonly Py_ssize_t nd

# ndarray member helpers (bound to ndarray methods elsewhere).
cdef _ndarray_shape_setter(_ndarray_base self, newshape)
cdef _ndarray_base _ndarray_reshape(_ndarray_base self, tuple shape, order)
cdef _ndarray_base _ndarray_transpose(_ndarray_base self, tuple axes)
cdef _ndarray_base _ndarray_swapaxes(
    _ndarray_base self, Py_ssize_t axis1, Py_ssize_t axis2)
cdef _ndarray_base _ndarray_flatten(_ndarray_base self, order)
cdef _ndarray_base _ndarray_ravel(_ndarray_base self, order)
cdef _ndarray_base _ndarray_squeeze(_ndarray_base self, axis)
cdef _ndarray_base _ndarray_repeat(_ndarray_base self, repeats, axis)

# Free functions exposed to other Cython modules.
cpdef _ndarray_base _expand_dims(_ndarray_base a, tuple axis)
cpdef _ndarray_base moveaxis(_ndarray_base a, source, destination)
cpdef _ndarray_base _move_single_axis(
    _ndarray_base a, Py_ssize_t source, Py_ssize_t destination)
cpdef _ndarray_base rollaxis(
    _ndarray_base a, Py_ssize_t axis, Py_ssize_t start=*)
cpdef _ndarray_base broadcast_to(_ndarray_base array, shape)
cpdef _ndarray_base _reshape(_ndarray_base self, const shape_t &shape_spec)
cpdef _ndarray_base _T(_ndarray_base self)
cpdef _ndarray_base _transpose(
    _ndarray_base self, const vector.vector[Py_ssize_t] &axes)
cpdef _ndarray_base _concatenate(
    list arrays, Py_ssize_t axis, tuple shape, _ndarray_base out, str casting)
cpdef _ndarray_base concatenate_method(
    tup, int axis, _ndarray_base out=*, dtype=*, casting=*)
# distutils: language = c++
import functools
import numpy
from cupy._core._kernel import ElementwiseKernel
from cupy._core._ufuncs import elementwise_copy
import cupy._core.core as core
cimport cpython # NOQA
cimport cython # NOQA
from libcpp cimport vector
from cupy._core._dtype cimport get_dtype, _raise_if_invalid_cast
from cupy._core cimport core
from cupy._core.core cimport _ndarray_base
from cupy._core cimport internal
from cupy._core._kernel cimport _check_peer_access, _preprocess_args
from cupy.cuda import device
@cython.final
cdef class broadcast:
    """Object that performs broadcasting.

    CuPy actually uses this class to support broadcasting in various
    operations. Note that this class does not provide an iterator.

    Args:
        arrays (tuple of arrays): Arrays to be broadcasted.

    Attributes:
        ~broadcast.shape (tuple of ints): The broadcasted shape.
        nd (int): Number of dimensions of the broadcasted shape.
        ~broadcast.size (int): Total size of the broadcasted shape.
        values (list of arrays): The broadcasted arrays.

    .. seealso:: :class:`numpy.broadcast`

    """

    def __init__(self, *arrays):
        cdef shape_t shape
        cdef list val = list(arrays)
        # Replaces each element of ``val`` with its broadcasted view and
        # writes the common shape into ``shape`` (raises on mismatch).
        internal._broadcast_core(val, shape)
        self.values = tuple(val)
        self.shape = tuple(shape)
        self.nd = <Py_ssize_t>shape.size()
        self.size = internal.prod(shape)
# _ndarray_base members

cdef _ndarray_shape_setter(_ndarray_base self, newshape):
    """In-place ``a.shape = ...`` setter: reshapes without copying.

    Raises AttributeError when the new shape would require a copy,
    matching NumPy's behavior for shape assignment.
    """
    cdef shape_t shape, strides
    if not cpython.PySequence_Check(newshape):
        newshape = (newshape,)
    # Resolve a single -1 entry from the total size.
    shape = internal.infer_unknown_dimension(newshape, self.size)
    _get_strides_for_nocopy_reshape(self, shape, strides)
    # An incomplete strides vector signals that a copy would be needed.
    if strides.size() != shape.size():
        raise AttributeError(
            'Incompatible shape for in-place modification. Use `.reshape()` '
            'to make a copy with the desired shape.')
    self._set_shape_and_strides(shape, strides, False, True)
cdef _ndarray_base _ndarray_reshape(_ndarray_base self, tuple shape, order):
    """Implementation of ``ndarray.reshape`` with 'C'/'F'/'A' order."""
    cdef int order_char = internal._normalize_order(order, False)

    # Allow ``reshape((a, b))`` in addition to ``reshape(a, b)``.
    if len(shape) == 1 and cpython.PySequence_Check(shape[0]):
        shape = tuple(shape[0])

    # 'A' resolves to 'F' only for arrays that are F- but not C-contiguous.
    if order_char == b'A':
        if self._f_contiguous and not self._c_contiguous:
            order_char = b'F'
        else:
            order_char = b'C'
    if order_char == b'C':
        return _reshape(self, shape)
    else:
        # TODO(grlee77): Support order within _reshape instead
        # The Fortran-ordered case is equivalent to:
        #     1.) reverse the axes via transpose
        #     2.) C-ordered reshape using reversed shape
        #     3.) reverse the axes via transpose
        return _T(_reshape(_T(self), shape[::-1]))
cdef _ndarray_base _ndarray_transpose(_ndarray_base self, tuple axes):
    """Implementation of ``ndarray.transpose(*axes)``.

    No arguments or a single ``None`` reverses the axes; a single
    sequence argument is unpacked into the axes tuple.
    """
    if len(axes) == 0:
        return _T(self)
    if len(axes) == 1:
        a = axes[0]
        if a is None:
            return _T(self)
        elif cpython.PySequence_Check(a):
            axes = tuple(a)
    return _transpose(self, axes)
cdef _ndarray_base _ndarray_swapaxes(
        _ndarray_base self, Py_ssize_t axis1, Py_ssize_t axis2):
    """Implementation of ``ndarray.swapaxes``: view with two axes exchanged."""
    cdef Py_ssize_t ndim = self.ndim
    cdef vector.vector[Py_ssize_t] axes
    if axis1 < -ndim or axis1 >= ndim or axis2 < -ndim or axis2 >= ndim:
        raise ValueError('Axis out of range')
    # Normalize negative axes.
    axis1 %= ndim
    axis2 %= ndim
    # Identity permutation with the two requested entries swapped.
    for i in range(ndim):
        axes.push_back(i)
    axes[axis1], axes[axis2] = axes[axis2], axes[axis1]
    return _transpose(self, axes)
cdef _ndarray_base _ndarray_flatten(_ndarray_base self, order):
    """Implementation of ``ndarray.flatten`` (always returns a copy).

    Supports 'C', 'F', 'A' and 'K' orders; non-C orders are reduced to the
    C case by transposing first.
    """
    cdef int order_char
    cdef vector.vector[Py_ssize_t] axes
    order_char = internal._normalize_order(order, True)
    if order_char == b'A':
        if self._f_contiguous and not self._c_contiguous:
            order_char = b'F'
        else:
            order_char = b'C'
    if order_char == b'C':
        return _ndarray_flatten_order_c(self)
    elif order_char == b'F':
        # F order == C order of the axis-reversed array.
        return _ndarray_flatten_order_c(_T(self))
    elif order_char == b'K':
        # 'K': follow the memory layout as closely as possible.
        axes = _npyiter_k_order_axes(self.strides)
        return _ndarray_flatten_order_c(_transpose(self, axes))
cdef _ndarray_base _ndarray_flatten_order_c(_ndarray_base self):
    """Copy ``self`` in C order and reshape the copy to 1-D in place."""
    newarray = self.copy(order='C')
    # A fresh C-contiguous copy can be flattened by rewriting shape/strides.
    newarray._shape.assign(<Py_ssize_t>1, self.size)
    newarray._strides.assign(<Py_ssize_t>1,
                             <Py_ssize_t>self.itemsize)
    newarray._c_contiguous = True
    newarray._f_contiguous = True
    return newarray
cdef vector.vector[Py_ssize_t] _npyiter_k_order_axes(strides_t& strides):
    # output transpose axes such that
    #   x.flatten(order="K") == x.transpose(axes).flatten(order="C")
    # by reproducing `npyiter_find_best_axis_ordering`
    # in numpy/core/src/multiarray/nditer_constr.c
    # Note that `flatten` and `ravel` should use this function for order="K",
    # while `copy(order="K")` should use `internal._get_strides_for_order_K`.
    # The result orders axes by decreasing |stride|; zero-stride axes are
    # "ambiguous" and keep their relative position as much as possible.
    cdef vector.vector[Py_ssize_t] axes
    cdef Py_ssize_t stride0, stride1
    cdef int ndim, i0, i1, ipos, k
    ndim = strides.size()
    # Insert axes one by one (from the last), placing each before the first
    # already-inserted axis whose |stride| does not exceed it.
    for i0 in reversed(range(ndim)):
        stride0 = abs(strides[i0])
        if stride0 == 0:  # ambiguous
            axes.insert(axes.begin(), i0)
            continue
        ipos = 0
        for k, i1 in enumerate(axes):
            stride1 = abs(strides[i1])
            if stride1 == 0:  # ambiguous
                continue
            elif stride1 <= stride0:  # shouldswap = false
                break
            else:  # shouldswap = true
                ipos = k + 1
        axes.insert(axes.begin() + ipos, i0)
    return axes
cdef _ndarray_base _ndarray_ravel(_ndarray_base self, order):
    """Implementation of ``ndarray.ravel``.

    Returns a view when possible (``_reshape`` avoids copying for
    compatible layouts); supports 'C', 'F', 'A' and 'K' orders.
    """
    cdef int order_char
    cdef shape_t shape
    cdef vector.vector[Py_ssize_t] axes
    shape.push_back(self.size)

    order_char = internal._normalize_order(order, True)
    if order_char == b'A':
        if self._f_contiguous and not self._c_contiguous:
            order_char = b'F'
        else:
            order_char = b'C'
    if order_char == b'C':
        return _reshape(self, shape)
    elif order_char == b'F':
        # F order == C order of the axis-reversed array.
        return _reshape(_T(self), shape)
    elif order_char == b'K':
        # 'K': follow the memory layout as closely as possible.
        axes = _npyiter_k_order_axes(self.strides)
        return _reshape(_transpose(self, axes), shape)
cdef _ndarray_base _ndarray_squeeze(_ndarray_base self, axis):
    """Implementation of ``ndarray.squeeze``: view with size-1 axes removed.

    ``axis`` may be None (all size-1 axes), an int, or a tuple of ints.
    Raises ValueError for a requested axis whose size is not one, or for a
    duplicated axis in a tuple.
    """
    cdef vector.vector[char] axis_flags
    cdef shape_t newshape
    cdef strides_t newstrides
    cdef Py_ssize_t ndim, naxes, _axis

    ndim = self._shape.size()
    axis_flags = vector.vector[char](ndim, 0)

    # Convert axis to boolean flag.
    if axis is None:
        for idim in range(ndim):
            if self._shape[idim] == 1:
                axis_flags[idim] = 1
    elif isinstance(axis, tuple):
        naxes = <Py_ssize_t>len(axis)
        for i in range(naxes):
            _axis = internal._normalize_axis_index(<Py_ssize_t>axis[i], ndim)
            if axis_flags[_axis] == 1:
                raise ValueError('duplicate value in \'axis\'')
            axis_flags[_axis] = 1
    else:
        _axis = <Py_ssize_t>axis
        if ndim == 0 and (_axis == 0 or _axis == -1):
            # Special case letting axis={-1,0} slip through for scalars,
            # for backwards compatibility reasons.
            pass
        else:
            _axis = internal._normalize_axis_index(_axis, ndim)
            axis_flags[_axis] = 1

    # Verify that the axes requested are all of size one
    any_ones = 0
    for idim in range(ndim):
        if axis_flags[idim] != 0:
            if self._shape[idim] == 1:
                any_ones = 1
            else:
                raise ValueError('cannot select an axis to squeeze out '
                                 'which has size not equal to one')

    # If there were no axes to squeeze out, return the same array
    if any_ones == 0:
        return self

    # Keep shape/strides only for the surviving axes.
    for i in range(ndim):
        if axis_flags[i] == 0:
            newshape.push_back(self._shape[i])
            newstrides.push_back(self._strides[i])

    v = self.view()
    # TODO(niboshi): Confirm update_x_contiguity flags
    v._set_shape_and_strides(newshape, newstrides, False, True)
    return v
cdef _ndarray_base _ndarray_repeat(_ndarray_base self, repeats, axis):
    """Thin method wrapper delegating ``ndarray.repeat`` to :func:`_repeat`."""
    return _repeat(self, repeats, axis)
# exposed

cpdef _ndarray_base _expand_dims(_ndarray_base a, tuple axis):
    """Insert length-1 axes at the positions in ``axis`` (cf. numpy.expand_dims)."""
    cdef vector.vector[Py_ssize_t] normalized_axis
    cdef out_ndim = a.ndim + len(axis)
    cdef shape_t a_shape = a.shape, out_shape
    # Axes are normalized against the *output* ndim; raises on duplicates.
    _normalize_axis_tuple(axis, out_ndim, normalized_axis)
    out_shape.assign(out_ndim, 0)
    cdef Py_ssize_t i, j
    # Mark the inserted positions with 1, then fill the remaining slots
    # with the original shape in order.
    for i in normalized_axis:
        out_shape[i] = 1
    j = 0
    for i in range(out_ndim):
        if out_shape[i] == 1:
            continue
        out_shape[i] = a_shape[j]
        j += 1
    return _reshape(a, out_shape)
cpdef _ndarray_base moveaxis(_ndarray_base a, source, destination):
    """Move axes of ``a`` to new positions (cf. numpy.moveaxis); returns a view."""
    cdef shape_t src, dest
    cdef Py_ssize_t ndim = a.ndim
    _normalize_axis_tuple(source, ndim, src)
    _normalize_axis_tuple(destination, ndim, dest)

    if src.size() != dest.size():
        raise ValueError('`source` and `destination` arguments must have '
                         'the same number of elements')

    cdef vector.vector[Py_ssize_t] order
    cdef Py_ssize_t i
    # Start with the non-moved axes in their original relative order ...
    for i in range(ndim):
        if not _has_element(src, i):
            order.push_back(i)

    cdef Py_ssize_t d, s
    # ... then insert each moved axis at its destination, in ascending
    # destination order so earlier insertions do not shift later ones.
    for d, s in sorted(zip(dest, src)):
        order.insert(order.begin() + d, s)

    return _transpose(a, order)
cpdef _ndarray_base _move_single_axis(
        _ndarray_base a, Py_ssize_t source, Py_ssize_t destination):
    """Like moveaxis, but supporting only integer source and destination."""
    cdef Py_ssize_t ndim = a.ndim
    source = internal._normalize_axis_index(source, ndim)
    destination = internal._normalize_axis_index(destination, ndim)

    # No-op fast path.
    if source == destination:
        return a

    cdef vector.vector[Py_ssize_t] order
    cdef Py_ssize_t i
    # All axes except ``source``, then ``source`` re-inserted at its target.
    for i in range(ndim):
        if i != source:
            order.push_back(i)
    order.insert(order.begin() + destination, source)

    return _transpose(a, order)
cpdef _ndarray_base rollaxis(
        _ndarray_base a, Py_ssize_t axis, Py_ssize_t start=0):
    """Roll ``axis`` until it lies at position ``start`` (cf. numpy.rollaxis)."""
    cdef Py_ssize_t i, ndim = a.ndim
    cdef vector.vector[Py_ssize_t] axes
    if axis < 0:
        axis += ndim
    if start < 0:
        start += ndim
    if not (0 <= axis < ndim and 0 <= start <= ndim):
        raise ValueError('Axis out of range')
    if axis < start:
        start -= 1
    if axis == start:
        return a
    if ndim == 2:
        # For 2-D, any non-trivial roll is a full transpose; passing the
        # empty ``axes`` makes _transpose take its axis-reversal fast path.
        return _transpose(a, axes)

    # General case: remove ``axis`` from the identity permutation and
    # re-insert it at ``start``.
    for i in range(ndim):
        axes.push_back(i)
    axes.erase(axes.begin() + axis)
    axes.insert(axes.begin() + start, axis)
    return _transpose(a, axes)
cpdef _ndarray_base _reshape(_ndarray_base self, const shape_t &shape_spec):
    """C-order reshape; returns a view when the layout allows, else a copy."""
    cdef shape_t shape
    cdef strides_t strides
    cdef _ndarray_base newarray
    # Resolve a single -1 entry from the total size.
    shape = internal.infer_unknown_dimension(shape_spec, self.size)
    if internal.vector_equal(shape, self._shape):
        return self.view()

    _get_strides_for_nocopy_reshape(self, shape, strides)
    if strides.size() == shape.size():
        # Compatible layout: reshape as a zero-copy view.
        return self._view(type(self), shape, strides, False, True, self)
    # Otherwise a contiguous copy is required first.
    newarray = self.copy()
    _get_strides_for_nocopy_reshape(newarray, shape, strides)

    if newarray.size != internal.prod(shape) or \
            strides.size() != shape.size():
        # Note: kept as in upstream -- a fresh C-contiguous copy always
        # admits a no-copy reshape, so this only guards internal errors.
        pass
    # TODO(niboshi): Confirm update_x_contiguity flags
    newarray._set_shape_and_strides(shape, strides, False, True)
    return newarray
cpdef _ndarray_base _T(_ndarray_base self):
    """Full axis reversal (``a.T``) as a view: shape and strides reversed."""
    ret = self.view()
    ret._shape.assign(self._shape.rbegin(), self._shape.rend())
    ret._strides.assign(self._strides.rbegin(), self._strides.rend())
    # Reversing all axes exactly swaps the contiguity flags.
    ret._c_contiguous = self._f_contiguous
    ret._f_contiguous = self._c_contiguous
    return ret
cpdef _ndarray_base _transpose(
        _ndarray_base self, const vector.vector[Py_ssize_t] &axes):
    """Permute axes per ``axes`` and return a view.

    An empty ``axes`` means full reversal.  Raises on out-of-range or
    repeated axes.  Identity and full-reversal permutations take fast paths.
    """
    cdef vector.vector[Py_ssize_t] a_axes
    cdef vector.vector[char] axis_flags
    cdef Py_ssize_t i, ndim, axis, axes_size
    cdef bint is_normal = True, is_trans = True

    axes_size = axes.size()
    if axes_size == 0:
        return _T(self)

    ndim = self._shape.size()
    if axes_size != ndim:
        raise ValueError("axes don't match array")

    axis_flags.resize(ndim, 0)
    for i in range(axes_size):
        axis = axes[i]
        if axis < -ndim or axis >= ndim:
            raise numpy.AxisError(axis, ndim)
        axis %= ndim
        a_axes.push_back(axis)
        if axis_flags[axis]:
            raise ValueError('repeated axis in transpose')
        axis_flags[axis] = 1
        # Track whether the permutation is the identity or the reversal.
        is_normal &= i == axis
        is_trans &= ndim - 1 - i == axis

    if is_normal:
        return self.view()
    if is_trans:
        return _T(self)

    ret = self.view()
    ret._shape.clear()
    ret._strides.clear()
    for axis in a_axes:
        ret._shape.push_back(self._shape[axis])
        ret._strides.push_back(self._strides[axis])
    ret._update_contiguity()
    return ret
cpdef array_split(_ndarray_base ary, indices_or_sections, Py_ssize_t axis):
    """Split ``ary`` into sub-views along ``axis`` (cf. numpy.array_split).

    ``indices_or_sections`` is either a section count (uneven splits
    allowed) or a sequence of split indices.  Returns a list of views.
    """
    cdef Py_ssize_t i, ndim, size, each_size, index, prev, stride
    cdef Py_ssize_t num_large
    cdef shape_t shape

    ndim = ary.ndim
    if -ndim > axis or ndim <= axis:
        raise IndexError('Axis exceeds ndim')
    if axis < 0:
        axis += ndim
    size = ary._shape[axis]

    if numpy.isscalar(indices_or_sections):
        # N sections: the first ``num_large`` pieces get one extra element.
        each_size = (size - 1) // indices_or_sections
        num_large = (size - 1) % indices_or_sections + 1
        indices = [i * each_size + min(i, num_large)
                   for i in range(1, indices_or_sections)]
    else:
        indices = [i if i >= 0 else size + i for i in indices_or_sections]

    if len(indices) == 0:
        return [ary]

    # Make a copy of shape for each view
    shape = ary._shape

    prev = 0
    ret = []
    stride = ary._strides[axis]
    if ary.size == 0:
        stride = 0
    for index in indices:
        index = min(index, size)
        # Empty slices (index <= prev) yield zero-length views.
        shape[axis] = max(index - prev, 0)
        v = ary.view()
        v.data = ary.data + prev * stride
        # TODO(niboshi): Confirm update_x_contiguity flags
        v._set_shape_and_strides(shape, ary._strides, True, True)
        ret.append(v)

        prev = index

    # Final piece: everything after the last split index.
    shape[axis] = size - prev
    v = ary.view()
    v.data = ary.data + prev * stride
    # TODO(niboshi): Confirm update_x_contiguity flags
    v._set_shape_and_strides(shape, ary._strides, True, True)
    ret.append(v)

    return ret
cpdef _ndarray_base broadcast_to(_ndarray_base array, shape):
    """Broadcast an array to a given shape.

    .. seealso::
        :func:`cupy.broadcast_to` for full documentation,
        :meth:`numpy.broadcast_to`

    """
    shape = tuple(shape) if numpy.iterable(shape) else (shape,)
    cdef int i, j, ndim = array._shape.size(), length = len(shape)
    cdef Py_ssize_t sh, a_sh
    if ndim > length:
        raise ValueError(
            'input operand has more dimensions than allowed by the axis '
            'remapping')
    cdef shape_t _shape = shape
    cdef strides_t strides
    # Default every stride to 0 (broadcasted axes repeat the same element);
    # real strides are filled in below for matching trailing dimensions.
    strides.assign(length, 0)
    for i in range(ndim):
        # Align the input's axes with the trailing axes of the target shape.
        j = i + length - ndim
        sh = _shape[j]
        a_sh = array._shape[i]
        if sh == a_sh:
            strides[j] = array._strides[i]
        elif a_sh != 1:
            raise ValueError(
                'operands could not be broadcast together with shape {} and '
                'requested shape {}'.format(array.shape, shape))

    view = array.view()
    # TODO(niboshi): Confirm update_x_contiguity flags
    view._set_shape_and_strides(_shape, strides, True, True)
    return view
cpdef _ndarray_base _repeat(_ndarray_base a, repeats, axis=None):
    """Repeat arrays along an axis.

    Args:
        a (cupy.ndarray): Array to transform.
        repeats (int, list or tuple): The number of repeats.
        axis (int): The axis to repeat.

    Returns:
        cupy.ndarray: Transformed array with repeats.

    Raises:
        ValueError: If ``repeats`` is a cupy.ndarray, is negative, or its
            length does not match ``a.shape[axis]``.

    .. seealso:: :func:`numpy.repeat`

    """
    cdef _ndarray_base ret

    if isinstance(repeats, _ndarray_base):
        # Fixed typo in the error message: 'ndaray' -> 'ndarray'.
        raise ValueError(
            'cupy.ndarray cannot be specified as `repeats` argument.')

    # Scalar and size 1 'repeat' arrays broadcast to any shape, for all
    # other inputs the dimension must match exactly.
    cdef bint broadcast = False
    # numpy.issubdtype(1, numpy.integer) fails with old numpy like 1.13.3.
    if (isinstance(repeats, int) or
            (hasattr(repeats, 'dtype') and
             numpy.issubdtype(repeats, numpy.integer))):
        if repeats < 0:
            raise ValueError(
                '\'repeats\' should not be negative: {}'.format(repeats))
        broadcast = True
        repeats = [repeats]
    elif cpython.PySequence_Check(repeats):
        for rep in repeats:
            if rep < 0:
                raise ValueError(
                    'all elements of \'repeats\' should not be negative: {}'
                    .format(repeats))
        if len(repeats) == 1:
            broadcast = True
    else:
        raise ValueError(
            '\'repeats\' should be int or sequence: {}'.format(repeats))

    if axis is None:
        if broadcast:
            # Fast path: repeat every element the same number of times by
            # broadcasting a (size, 1) view into a (size, repeats) output.
            a = _reshape(a, (-1, 1))
            ret = core.ndarray((a.size, repeats[0]), dtype=a.dtype)
            if ret.size:
                elementwise_copy(a, ret)
            return ret.ravel()
        else:
            # Per-element repeats: operate on the flattened array.
            a = a.ravel()
            axis = 0
    else:
        axis = internal._normalize_axis_index(axis, a.ndim)

    if broadcast:
        repeats = repeats * a._shape[axis]
    elif a.shape[axis] != len(repeats):
        raise ValueError(
            '\'repeats\' and \'axis\' of \'a\' should be same length: {} != {}'
            .format(a.shape[axis], len(repeats)))

    ret_shape = list(a.shape)
    ret_shape[axis] = sum(repeats)
    ret = core.ndarray(ret_shape, dtype=a.dtype)

    # Copy each source slab into its repeated span of the output.
    a_index = [slice(None)] * len(ret_shape)
    ret_index = list(a_index)
    offset = 0
    for i in range(a._shape[axis]):
        if repeats[i] == 0:
            continue
        a_index[axis] = slice(i, i + 1)
        ret_index[axis] = slice(offset, offset + repeats[i])
        # convert to tuple because cupy has a indexing bug
        ret[tuple(ret_index)] = a[tuple(a_index)]
        offset += repeats[i]
    return ret
cpdef _ndarray_base concatenate_method(
        tup, int axis, _ndarray_base out=None, dtype=None,
        casting='same_kind'):
    """Validate inputs for ``cupy.concatenate`` and dispatch to _concatenate.

    Checks non-emptiness, array types, matching ndim/shape, resolves the
    output dtype (promotion, or ``out``/``dtype`` -- which are mutually
    exclusive), and enforces the casting rule.
    """
    cdef int ndim0
    cdef int i
    cdef _ndarray_base a, a0
    if dtype is not None:
        dtype = get_dtype(dtype)

    dev_id = device.get_device_id()
    arrays = _preprocess_args(dev_id, tup, False)

    # Check if the input is not an empty sequence
    if len(arrays) == 0:
        raise ValueError('Cannot concatenate from empty tuple')

    # Check types of the input arrays
    for o in arrays:
        if not isinstance(o, _ndarray_base):
            raise TypeError('Only cupy arrays can be concatenated')

    # Check ndim > 0 for the input arrays
    for o in arrays:
        a = o
        if a._shape.size() == 0:
            raise TypeError('zero-dimensional arrays cannot be concatenated')

    # Check ndim consistency of the input arrays
    a0 = arrays[0]
    ndim0 = a0._shape.size()
    for o in arrays[1:]:
        a = o
        if a._shape.size() != ndim0:
            raise ValueError(
                'All arrays to concatenate must have the same ndim')

    # Check shape consistency of the input arrays, and compute the output
    # shape by accumulating sizes along the concatenation axis.
    shape0 = a0._shape
    axis = internal._normalize_axis_index(axis, ndim0)
    for o in arrays[1:]:
        a = o
        for i in range(ndim0):
            if i != axis and shape0[i] != a._shape[i]:
                raise ValueError(
                    'All arrays must have same shape except the axis to '
                    'concatenate')
        shape0[axis] += a._shape[axis]

    # Compute the output dtype
    if out is None:
        if dtype is None:
            dtype = a0.dtype
            have_same_types = True
            for o in arrays[1:]:
                have_same_types = have_same_types and (o.dtype == dtype)
            if not have_same_types:
                dtype = functools.reduce(
                    numpy.promote_types, set([a.dtype for a in arrays]))
    else:
        if dtype is not None:
            raise TypeError('concatenate() only takes `out` or `dtype` as an '
                            'argument, but both were provided.')
        dtype = out.dtype

    # Check casting rule
    for o in arrays:
        _raise_if_invalid_cast(o.dtype, dtype, casting)

    # Prepare the output array
    # NOTE(review): this local intentionally-or-not shadows the ``shape_t``
    # C type name; it is only used as a Python tuple here.
    shape_t = tuple(shape0)
    if out is None:
        out = core.ndarray(shape_t, dtype=dtype)
    else:
        if len(out.shape) != len(shape_t):
            raise ValueError('Output array has wrong dimensionality')
        if out.shape != shape_t:
            raise ValueError('Output array is the wrong shape')

    return _concatenate(arrays, axis, shape_t, out, casting)
cpdef _ndarray_base _concatenate(
        list arrays, Py_ssize_t axis, tuple shape, _ndarray_base out,
        str casting):
    """Copy ``arrays`` into ``out`` along ``axis``.

    For many small same-dtype inputs, a single gather kernel is used;
    otherwise each array is copied into its output slice separately.
    """
    cdef _ndarray_base a, b
    cdef Py_ssize_t i, aw, itemsize, axis_size
    cdef bint all_same_type, same_shape_and_contiguous
    # If arrays are large, Issuing each copy method is efficient.
    cdef Py_ssize_t threshold_size = 2 * 1024 * 1024

    dtype = out.dtype
    if len(arrays) > 8:
        all_same_type = True
        same_shape_and_contiguous = True
        axis_size = shape[axis] // len(arrays)
        total_bytes = 0
        itemsize = dtype.itemsize
        for a in arrays:
            if a.dtype != dtype:
                all_same_type = False
                break
            if same_shape_and_contiguous:
                same_shape_and_contiguous = (
                    a._c_contiguous and a._shape[axis] == axis_size)
            total_bytes += a.size * itemsize

        if all_same_type and total_bytes < threshold_size * len(arrays):
            return _concatenate_single_kernel(
                arrays, axis, shape, dtype, same_shape_and_contiguous, out)

    # Fallback: one elementwise copy per input into its output slice.
    i = 0
    slice_list = [slice(None)] * len(shape)
    for a in arrays:
        aw = a._shape[axis]
        slice_list[axis] = slice(i, i + aw)
        b = out[tuple(slice_list)]
        elementwise_copy(a, b, casting=casting)
        i += aw
    return out
cpdef Py_ssize_t size(_ndarray_base a, axis=None) except? -1:
    """Returns the number of elements along a given axis.

    Args:
        a (ndarray): Input data.
        axis (int or None): Axis along which the elements are counted.
            When it is ``None``, it returns the total number of elements.

    Returns:
        int: Number of elements along the given axis.

    Raises:
        IndexError: If ``axis`` is out of range.

    """
    cdef int index, ndim
    if axis is None:
        return a.size
    else:
        index = axis
        ndim = a._shape.size()
        if index < 0:
            index += ndim
        if not 0 <= index < ndim:
            raise IndexError('index out of range')
        return a._shape[index]
# private

cdef bint _has_element(const shape_t &source, Py_ssize_t n):
    """Linear-scan membership test: True iff ``n`` occurs in ``source``."""
    for i in range(source.size()):
        if source[i] == n:
            return True
    return False
cdef _get_strides_for_nocopy_reshape(
        _ndarray_base a, const shape_t &newshape, strides_t &newstrides):
    """Compute strides for reshaping ``a`` to ``newshape`` without a copy.

    On success ``newstrides`` has one entry per new dimension; when the
    layout is incompatible it is left *shorter* than ``newshape`` (cleared),
    which callers use as the "copy required" signal.
    """
    cdef Py_ssize_t size, itemsize, ndim, dim, last_stride
    size = a.size
    newstrides.clear()

    itemsize = a.itemsize
    if size == 1:
        # Any shape works for a single element; strides are arbitrary.
        newstrides.assign(<Py_ssize_t>newshape.size(), itemsize)
        return
    if size == 0:
        internal.get_contiguous_strides_inplace(
            newshape, newstrides, itemsize, True, False)
        return

    cdef shape_t shape
    cdef strides_t strides
    # Collapse contiguous runs of axes first to simplify matching.
    internal.get_reduced_dims(a._shape, a._strides, itemsize, shape, strides)

    ndim = shape.size()
    dim = 0
    # Walk the new shape, splitting each reduced dimension into factors.
    last_stride = shape[0] * strides[0]
    for i in range(newshape.size()):
        size = newshape[i]
        if size <= 1:
            newstrides.push_back(last_stride)
            continue
        if dim >= ndim or shape[dim] % size != 0:
            # Incompatible factorization: signal "copy needed".
            newstrides.clear()
            break
        shape[dim] //= size
        last_stride = shape[dim] * strides[dim]
        newstrides.push_back(last_stride)
        if shape[dim] == 1:
            dim += 1
cdef _normalize_axis_tuple(axis, Py_ssize_t ndim, shape_t &ret):
    """Normalizes an axis argument into a tuple of non-negative integer axes.

    Arguments `argname` and `allow_duplicate` are not supported.
    Results are appended to ``ret``; raises ValueError on repeats.
    """
    if numpy.isscalar(axis):
        axis = (axis,)

    for ax in axis:
        ax = internal._normalize_axis_index(ax, ndim)
        if _has_element(ret, ax):
            # the message in `numpy.core.numeric.normalize_axis_tuple`
            raise ValueError('repeated axis')
        ret.push_back(ax)
cdef _ndarray_base _concatenate_single_kernel(
        list arrays, Py_ssize_t axis, tuple shape, dtype,
        bint same_shape_and_contiguous, _ndarray_base out):
    """Concatenate many same-dtype arrays with one gather kernel launch.

    Device pointers, cumulative axis offsets, and per-array strides are
    packed into small device arrays consumed by the kernels below.
    """
    cdef _ndarray_base a, x
    cdef Py_ssize_t base, cum, ndim
    cdef int i, j
    cdef Py_ssize_t[:] ptrs
    cdef Py_ssize_t[:] cum_sizes
    cdef Py_ssize_t[:, :] x_strides
    cdef int device_id = device.get_device_id()

    assert out is not None

    # Table of raw device pointers, one per input array.
    ptrs = numpy.ndarray(len(arrays), numpy.int64)
    for i, a in enumerate(arrays):
        _check_peer_access(a, device_id)
        ptrs[i] = a.data.ptr
    x = core.array(ptrs)

    if same_shape_and_contiguous:
        # All inputs contribute equally sized contiguous chunks.
        base = internal.prod_sequence(shape[axis:]) // len(arrays)
        _concatenate_kernel_same_size(x, base, out)
        return out

    # General case: per-array strides plus cumulative sizes along ``axis``
    # (searched in the kernel to locate the source array of each element).
    ndim = len(shape)
    x_strides = numpy.ndarray((len(arrays), ndim), numpy.int64)
    cum_sizes = numpy.ndarray(len(arrays), numpy.int64)
    cum = 0
    for i, a in enumerate(arrays):
        for j in range(ndim):
            x_strides[i, j] = <int>a._strides[j]
        cum_sizes[i] = cum
        cum += <int>a._shape[axis]

    _concatenate_kernel(
        x, axis, core.array(cum_sizes), core.array(x_strides), out)
    return out
# Gather kernel for the fast path: every input array is C-contiguous and
# contributes ``base`` elements per outer index, so the source array and
# offset can be computed arithmetically from the flat output index ``i``.
cdef _concatenate_kernel_same_size = ElementwiseKernel(
    'raw P x, int64 base',
    'T y',
    '''
    ptrdiff_t middle = i / base;
    ptrdiff_t top = middle / x.size();
    ptrdiff_t array_ind = middle - top * x.size();
    ptrdiff_t offset = i + (top - middle) * base;
    y = reinterpret_cast<T*>(x[array_ind])[offset];
    ''',
    'cupy_concatenate_same_size'
)
# General gather kernel: binary-searches ``cum_sizes`` to find which input
# array owns the output coordinate along ``axis``, then walks that array's
# strides to locate the source element.  ``reduce_dims=False`` keeps the
# output indexer's dimensionality intact so ``_ind.get()`` is meaningful.
cdef _concatenate_kernel = ElementwiseKernel(
    '''raw P x, int32 axis, raw int64 cum_sizes, raw int64 x_strides''',
    'T y',
    '''
    ptrdiff_t axis_ind = _ind.get()[axis];
    ptrdiff_t left = 0;
    ptrdiff_t right = cum_sizes.size();

    while (left < right - 1) {
        ptrdiff_t m = (left + right) / 2;
        if (axis_ind < cum_sizes[m]) {
            right = m;
        } else {
            left = m;
        }
    }

    ptrdiff_t array_ind = left;
    axis_ind -= cum_sizes[left];
    char* ptr = reinterpret_cast<char*>(x[array_ind]);
    for (int j = _ind.ndim - 1; j >= 0; --j) {
        ptrdiff_t ind[] = {array_ind, j};
        ptrdiff_t offset;
        if (j == axis) {
            offset = axis_ind;
        } else {
            offset = _ind.get()[j];
        }
        ptr += x_strides[ind] * offset;
    }

    y = *reinterpret_cast<T*>(ptr);
    ''',
    'cupy_concatenate',
    reduce_dims=False
)
from cupy._core.core cimport _ndarray_base
# --- Cython declarations (``.pxd``-style) for the math routines below ---
cdef _ndarray_base _ndarray_conj(_ndarray_base self)
cdef _ndarray_base _ndarray_real_getter(_ndarray_base self)
cdef _ndarray_base _ndarray_real_setter(_ndarray_base self, value)
cdef _ndarray_base _ndarray_imag_getter(_ndarray_base self)
cdef _ndarray_base _ndarray_imag_setter(_ndarray_base self, value)
cdef _ndarray_base _ndarray_prod(
    _ndarray_base self, axis, dtype, out, keepdims)
cdef _ndarray_base _ndarray_sum(_ndarray_base self, axis, dtype, out, keepdims)
cdef _ndarray_base _ndarray_cumsum(_ndarray_base self, axis, dtype, out)
cdef _ndarray_base _ndarray_cumprod(_ndarray_base self, axis, dtype, out)
cdef _ndarray_base _ndarray_clip(_ndarray_base self, a_min, a_max, out)

cpdef _ndarray_base _nansum(_ndarray_base a, axis, dtype, out, keepdims)
cpdef _ndarray_base _nanprod(_ndarray_base a, axis, dtype, out, keepdims)

# Operation selector for the inclusive-scan (prefix sum/prod) kernels.
cpdef enum scan_op:
    SCAN_SUM = 0
    SCAN_PROD = 1

cdef _ndarray_base scan(_ndarray_base a, op, dtype=*, _ndarray_base out=*,
                        incomplete=*, chunk_size=*)

# Ufunc objects defined at module level in the implementation.
cdef object _sum_auto_dtype
cdef object _add
cdef object _conj
cdef object _angle
cdef object _positive
cdef object _negative
cdef object _multiply
cdef object _divide
cdef object _power
cdef object _subtract
cdef object _true_divide
cdef object _floor_divide
cdef object _remainder
cdef object _absolute
cdef object _sqrt
import string
import numpy
import cupy
from cupy._core._reduction import create_reduction_func
from cupy._core._kernel import create_ufunc, _get_warpsize
from cupy._core._scalar import get_typename
from cupy._core._ufuncs import elementwise_copy
import cupy._core.core as core
from cupy._core cimport internal
from cupy import _util
from cupy_backends.cuda.api cimport runtime
from cupy._core cimport _accelerator
from cupy._core._dtype cimport get_dtype
from cupy._core.core cimport _ndarray_init
from cupy._core.core cimport compile_with_cache
from cupy._core.core cimport _ndarray_base
from cupy.cuda cimport memory
from cupy.cuda import cub
# Optional cuTENSOR backend: ``None`` when the extension is not installed,
# checked before routing reductions through the cuTENSOR accelerator.
try:
    import cupy_backends.cuda.libs.cutensor as cuda_cutensor
except ImportError:
    cuda_cutensor = None
# _ndarray_base members

cdef _ndarray_base _ndarray_conj(_ndarray_base self):
    """Complex conjugate; non-complex arrays are returned unchanged (no copy)."""
    if self.dtype.kind == 'c':
        return _conjugate(self)
    else:
        return self
cdef _ndarray_base _ndarray_real_getter(_ndarray_base self):
    """``a.real``: for complex arrays, a float view over the real parts.

    The view shares memory (same data pointer, same strides, half-width
    dtype); real arrays are returned as-is.
    """
    if self.dtype.kind == 'c':
        # e.g. complex64 ('F') -> float32 ('f').
        dtype = get_dtype(self.dtype.char.lower())
        view = core.ndarray.__new__(
            type(self), shape=self._shape, dtype=dtype, _obj=self,
            memptr=self.data, strides=self._strides)
        # Keep the view rooted at the base-most array.
        (<_ndarray_base>view).base = (
            self.base if self.base is not None else self)
        return view
    return self
cdef _ndarray_base _ndarray_real_setter(_ndarray_base self, value):
    """``a.real = value``: elementwise copy into the real-part view."""
    elementwise_copy(value, _ndarray_real_getter(self))
cdef _ndarray_base _ndarray_imag_getter(_ndarray_base self):
    """``a.imag``: imaginary-part view for complex arrays, zeros otherwise."""
    cdef memory.MemoryPointer memptr
    if self.dtype.kind == 'c':
        dtype = get_dtype(self.dtype.char.lower())
        memptr = self.data
        # Make the memory pointer point to the first imaginary element.
        # Note that even if the array doesn't have a valid memory (e.g. 0-size
        # array), the resulting array should be a view of the original array,
        # aligning with NumPy behavior.
        if memptr.ptr != 0:
            memptr = memptr + self.dtype.itemsize // 2
        view = core.ndarray.__new__(
            type(self), shape=self._shape, dtype=dtype, memptr=memptr,
            strides=self._strides)
        (<_ndarray_base>view).base = (
            self.base if self.base is not None else self)
        return view
    # Real input: NumPy returns a zero-filled array (not a view).
    new_array = core.ndarray.__new__(type(self), self.shape, dtype=self.dtype)
    new_array.fill(0)
    return new_array
cdef _ndarray_base _ndarray_imag_setter(_ndarray_base self, value):
    """``a.imag = value``: only valid for complex arrays (matches NumPy)."""
    if self.dtype.kind == 'c':
        elementwise_copy(value, _ndarray_imag_getter(self))
    else:
        raise TypeError('cupy.ndarray does not have imaginary part to set')
cdef _ndarray_base _ndarray_prod(
        _ndarray_base self, axis, dtype, out, keepdims):
    """``ndarray.prod``: try configured accelerators, then the fallback ufunc."""
    for accelerator in _accelerator._routine_accelerators:
        result = None
        if accelerator == _accelerator.ACCELERATOR_CUB:
            # result will be None if the reduction is not compatible with CUB
            result = cub.cub_reduction(
                self, cub.CUPY_CUB_PROD, axis, dtype, out, keepdims)
        if (accelerator == _accelerator.ACCELERATOR_CUTENSOR and
                cuda_cutensor is not None):
            from cupyx import cutensor
            result = cutensor._try_reduction_routine(
                self, axis, dtype, out, keepdims, cuda_cutensor.OP_MUL, 1, 0)
        if result is not None:
            return result
    # Fallback: auto-promoting reduction when no dtype was requested.
    if dtype is None:
        return _prod_auto_dtype(self, axis, dtype, out, keepdims)
    else:
        return _prod_keep_dtype(self, axis, dtype, out, keepdims)
cdef _ndarray_base _ndarray_sum(
        _ndarray_base self, axis, dtype, out, keepdims):
    """``ndarray.sum``: try configured accelerators, then the fallback ufunc."""
    for accelerator in _accelerator._routine_accelerators:
        result = None
        if accelerator == _accelerator.ACCELERATOR_CUB:
            # result will be None if the reduction is not compatible with CUB
            result = cub.cub_reduction(
                self, cub.CUPY_CUB_SUM, axis, dtype, out, keepdims)
        if (accelerator == _accelerator.ACCELERATOR_CUTENSOR and
                cuda_cutensor is not None):
            from cupyx import cutensor
            result = cutensor._try_reduction_routine(
                self, axis, dtype, out, keepdims, cuda_cutensor.OP_ADD, 1, 0)
        if result is not None:
            return result
    # Fallback: auto-promoting reduction when no dtype was requested.
    if dtype is None:
        return _sum_auto_dtype(self, axis, dtype, out, keepdims)
    else:
        return _sum_keep_dtype(self, axis, dtype, out, keepdims)
cdef _ndarray_base _ndarray_cumsum(_ndarray_base self, axis, dtype, out):
    """Thin method wrapper delegating ``ndarray.cumsum`` to cupy.cumsum."""
    return cupy.cumsum(self, axis, dtype, out)
cdef _ndarray_base _ndarray_cumprod(_ndarray_base self, axis, dtype, out):
    """Thin method wrapper delegating ``ndarray.cumprod`` to cupy.cumprod."""
    return cupy.cumprod(self, axis, dtype, out)
cdef _ndarray_base _ndarray_clip(_ndarray_base self, a_min, a_max, out):
    """``ndarray.clip``: fill a missing bound with the dtype's extreme value."""
    if a_min is None and a_max is None:
        raise ValueError('array_clip: must set either max or min')
    kind = self.dtype.kind
    # A None bound is replaced by -inf/inf (floats) or the integer
    # min/max so the single _clip ufunc can always take both bounds.
    if a_min is None:
        if kind == 'f':
            a_min = self.dtype.type('-inf')
        elif kind in 'iu':
            a_min = numpy.iinfo(self.dtype.type).min
    if a_max is None:
        if kind == 'f':
            a_max = self.dtype.type('inf')
        elif kind in 'iu':
            a_max = numpy.iinfo(self.dtype.type).max
    return _clip(self, a_min, a_max, out=out)
# private/internal

# Per-op C operator and identity element used when rendering the scan and
# block-sum kernel templates below.
_op_char = {scan_op.SCAN_SUM: '+', scan_op.SCAN_PROD: '*'}
_identity = {scan_op.SCAN_SUM: 0, scan_op.SCAN_PROD: 1}
@cupy._util.memoize(for_each_device=True)
def _cupy_bsum_shfl(op, chunk_size, warp_size=32):
    """Returns a kernel that computes the sum/prod of each thread-block.

    Args:
        op (int): Operation type. SCAN_SUM or SCAN_PROD.
        chunk_size (int): Number of array elements processed by a single
            thread-block.
        warp_size (int): Warp size.

    Returns:
        cupy.ElementwiseKernel

    Example:
       a = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
       _cupy_bsum(op=SCAN_SUM, chunk_size=4)(a, b, ...)
       b == [10, 26, 19]

    Note:
       This uses warp shuffle functions to exchange data in a warp.
       See the link below for details about warp shuffle functions.
       https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#warp-shuffle-functions
    """
    block_size = chunk_size // 2  # each thread handles two elements
    in_params = 'raw T a'
    out_params = 'raw O b'
    # Per-block shared buffer holding one partial result per warp.
    loop_prep = string.Template("""
        __shared__ O smem[${block_size} / ${warp_size}];
        const int n_warp = ${block_size} / ${warp_size};
        const int warp_id = threadIdx.x / ${warp_size};
        const int lane_id = threadIdx.x % ${warp_size};
    """).substitute(block_size=block_size, warp_size=warp_size)
    # Two-stage reduction: butterfly-shuffle within each warp, then the
    # first warp reduces the per-warp partials and lane 0 writes the result.
    loop_body = string.Template("""
        O x = ${identity};
        if (2*i < a.size()) x = a[2*i];
        if (2*i + 1 < a.size()) x ${op}= a[2*i + 1];
        for (int j = 1; j < ${warp_size}; j *= 2) {
            x ${op}= __shfl_xor_sync(0xffffffff, x, j, ${warp_size});
        }
        if (lane_id == 0) smem[warp_id] = x;
        __syncthreads();
        if (warp_id == 0) {
            x = ${identity};
            if (lane_id < n_warp) x = smem[lane_id];
            for (int j = 1; j < n_warp; j *= 2) {
                x ${op}= __shfl_xor_sync(0xffffffff, x, j, ${warp_size});
            }
            int block_id = i / ${block_size};
            if (lane_id == 0) b[block_id] = x;
        }
    """).substitute(block_size=block_size, warp_size=warp_size,
                    op=_op_char[op], identity=_identity[op])
    return cupy.ElementwiseKernel(in_params, out_params, loop_body,
                                  'cupy_bsum_shfl', loop_prep=loop_prep)
@cupy._util.memoize(for_each_device=True)
def _cupy_bsum_smem(op, chunk_size, warp_size=32):
    """Returns a kernel that computes the sum/prod of each thread-block.

    Args:
        op (int): Operation type. SCAN_SUM or SCAN_PROD.
        chunk_size (int): Number of array elements processed by a single
            thread-block.
        warp_size (int): Warp size.

    Returns:
        cupy.ElementwiseKernel

    Example:
        a = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
        _cupy_bsum_smem(op=SCAN_SUM, chunk_size=4)(a, b, ...)
        b == [10, 26, 19]

    Note:
        This uses shared memory to exchange data in a warp.
    """
    block_size = chunk_size // 2  # each thread handles two elements
    in_params = 'raw T a'
    out_params = 'raw O b'
    # smem1 replaces the warp-shuffle exchange of the _shfl variant;
    # smem2 holds the per-warp partial results.
    loop_prep = string.Template("""
        __shared__ O smem1[${block_size}];
        __shared__ O smem2[${warp_size}];
        const int n_warp = ${block_size} / ${warp_size};
        const int warp_id = threadIdx.x / ${warp_size};
        const int lane_id = threadIdx.x % ${warp_size};
    """).substitute(block_size=block_size, warp_size=warp_size)
    # Same butterfly (XOR) reduction pattern as the shuffle variant, but
    # lanes communicate through shared memory; usable for types that the
    # platform's __shfl* intrinsics do not support.
    loop_body = string.Template("""
        O x = ${identity};
        if (2*i < a.size()) x = a[2*i];
        if (2*i + 1 < a.size()) x ${op}= a[2*i + 1];
        for (int j = 1; j < ${warp_size}; j *= 2) {
            smem1[threadIdx.x] = x;  __syncwarp();
            x ${op}= smem1[threadIdx.x ^ j];  __syncwarp();
        }
        if (lane_id == 0) smem2[warp_id] = x;
        __syncthreads();
        if (warp_id == 0) {
            x = ${identity};
            if (lane_id < n_warp) x = smem2[lane_id];
            for (int j = 1; j < n_warp; j *= 2) {
                smem2[lane_id] = x;  __syncwarp();
                x ${op}= smem2[lane_id ^ j];  __syncwarp();
            }
            int block_id = i / ${block_size};
            if (lane_id == 0) b[block_id] = x;
        }
    """).substitute(block_size=block_size, warp_size=warp_size,
                    op=_op_char[op], identity=_identity[op])
    return cupy.ElementwiseKernel(in_params, out_params, loop_body,
                                  'cupy_bsum_smem', loop_prep=loop_prep)
@cupy._util.memoize(for_each_device=True)
def _cupy_scan_naive(op, chunk_size, warp_size=32):
    """Returns a kernel to compute an inclusive scan.

    It first performs an inclusive scan in each thread-block and then add the
    scan results for the sum/prod of the chunks.

    Args:
        op (int): Operation type. SCAN_SUM or SCAN_PROD.
        chunk_size (int): Number of array elements processed by a single
            thread-block.
        warp_size (int): Warp size.

    Returns:
        cupy.ElementwiseKernel

    Example:
        b = [10, 36, 55]
        a = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
        _cupy_scan_naive(op=SCAN_SUM, chunk_size=4)(b, a, out, ...)
        out == [1, 3, 6, 10, 15, 21, 28, 36, 45, 55]

    Note:
        This uses a kind of method called "Naive Parallel Scan" for inclusive
        scan in each thread-block. See below for details about it.
        https://developer.nvidia.com/gpugems/gpugems3/part-vi-gpu-computing/chapter-39-parallel-prefix-sum-scan-cuda
    """
    # Note: ``b`` carries the already-scanned per-chunk totals produced by
    # the bsum kernels (see ``scan`` below); one thread handles one element.
    in_params = 'raw O b'
    out_params = 'raw T a, raw O out'
    loop_prep = string.Template("""
        __shared__ O smem1[${block_size}];
        __shared__ O smem2[${warp_size}];
        const int n_warp = ${block_size} / ${warp_size};
        const int warp_id = threadIdx.x / ${warp_size};
        const int lane_id = threadIdx.x % ${warp_size};
    """).substitute(block_size=chunk_size, warp_size=warp_size)
    # Three phases: (1) naive scan within each warp via smem1,
    # (2) warp 0 scans the per-warp totals in smem2,
    # (3) every element folds in its preceding warps' total and the
    # preceding blocks' total from ``b``.
    loop_body = string.Template("""
        O x = ${identity};
        if (i < a.size()) x = a[i];
        for (int j = 1; j < ${warp_size}; j *= 2) {
            smem1[threadIdx.x] = x;  __syncwarp();
            if (lane_id - j >= 0) x ${op}= smem1[threadIdx.x - j];
            __syncwarp();
        }
        if (lane_id == ${warp_size} - 1) smem2[warp_id] = x;
        __syncthreads();
        if (warp_id == 0) {
            O y = ${identity};
            if (lane_id < n_warp) y = smem2[lane_id];
            for (int j = 1; j < n_warp; j *= 2) {
                smem2[lane_id] = y;  __syncwarp();
                if (lane_id - j >= 0) y ${op}= smem2[lane_id - j];
                __syncwarp();
            }
            smem2[lane_id] = y;
        }
        __syncthreads();
        if (warp_id > 0) x ${op}= smem2[warp_id - 1];
        int block_id = i / ${block_size};
        if (block_id > 0) x ${op}= b[block_id - 1];
        if (i < a.size()) out[i] = x;
    """).substitute(block_size=chunk_size, warp_size=warp_size,
                    op=_op_char[op], identity=_identity[op])
    return cupy.ElementwiseKernel(in_params, out_params, loop_body,
                                  'cupy_scan_naive', loop_prep=loop_prep)
@cupy._util.memoize(for_each_device=True)
def _cupy_scan_btree(op, chunk_size, warp_size=32):
    """Returns a kernel to compute an inclusive scan.

    It first performs an inclusive scan in each thread-block and then add the
    scan results for the sum/prod of the chunks.

    Args:
        op (int): Operation type. SCAN_SUM or SCAN_PROD.
        chunk_size (int): Number of array elements processed by a single
            thread-block.
        warp_size (int): Warp size.

    Returns:
        cupy.ElementwiseKernel

    Example:
        b = [10, 36, 55]
        a = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
        _cupy_scan_btree(op=SCAN_SUM, chunk_size=4)(b, a, out, ...)
        out == [1, 3, 6, 10, 15, 21, 28, 36, 45, 55]

    Note:
        This uses a kind of method called "Work-Efficient Parallel Scan" for
        inclusive scan in each thread-block. See below link for details about
        "Work-Efficient Parallel Scan".
        https://developer.nvidia.com/gpugems/gpugems3/part-vi-gpu-computing/chapter-39-parallel-prefix-sum-scan-cuda
    """
    in_params = 'raw O b'
    out_params = 'raw T a, raw O out'
    # smem0 has one extra leading slot holding the identity so that
    # ``smem0[threadIdx.x]`` in the down-sweep reads an exclusive prefix;
    # smem1 aliases smem0 + 1 to give a plain 0-based view.
    loop_prep = string.Template("""
        __shared__ O smem0[${block_size} + 1];
        O *smem1 = smem0 + 1;
        __shared__ O smem2[${warp_size}];
        const int n_warp = ${block_size} / ${warp_size};
        const int warp_id = threadIdx.x / ${warp_size};
        const int lane_id = threadIdx.x % ${warp_size};
        if (threadIdx.x == 0) smem0[0] = ${identity};
    """).substitute(block_size=chunk_size, warp_size=warp_size,
                    identity=_identity[op])
    # Up-sweep within each warp, then warp 0 combines the per-warp totals
    # (up-sweep + down-sweep on smem2), then a block-wide down-sweep
    # distributes the prefixes; finally each element folds in the
    # preceding blocks' total from ``b``.
    loop_body = string.Template("""
        O x = ${identity};
        if (i < a.size()) x = a[i];
        for (int j = 1; j < ${warp_size}; j *= 2) {
            smem1[threadIdx.x] = x;  __syncwarp();
            if (lane_id % (2*j) == (2*j)-1) {
                x ${op}= smem1[threadIdx.x - j];
            }
            __syncwarp();
        }
        smem1[threadIdx.x] = x;
        __syncthreads();
        if (warp_id == 0) {
            O y = ${identity};
            if (lane_id < n_warp) {
                y = smem0[${warp_size} * (lane_id + 1)];
            }
            for (int j = 1; j < n_warp; j *= 2) {
                smem2[lane_id] = y;  __syncwarp();
                if (lane_id % (2*j) == (2*j)-1) {
                    y ${op}= smem2[lane_id - j];
                }
                __syncwarp();
            }
            for (int j = n_warp / 4; j > 0; j /= 2) {
                smem2[lane_id] = y;  __syncwarp();
                if ((lane_id % (2*j) == j-1) && (lane_id >= 2*j)) {
                    y ${op}= smem2[lane_id - j];
                }
                __syncwarp();
            }
            if (lane_id < n_warp) {
                smem0[${warp_size} * (lane_id + 1)] = y;
            }
        }
        __syncthreads();
        x = smem0[threadIdx.x];
        for (int j = ${warp_size} / 2; j > 0; j /= 2) {
            if (lane_id % (2*j) == j) {
                x ${op}= smem0[threadIdx.x - j];
            }
            __syncwarp();
            smem0[threadIdx.x] = x;  __syncwarp();
        }
        __syncthreads();
        x = smem1[threadIdx.x];
        int block_id = i / ${block_size};
        if (block_id > 0) x ${op}= b[block_id - 1];
        if (i < a.size()) out[i] = x;
    """).substitute(block_size=chunk_size, warp_size=warp_size,
                    op=_op_char[op], identity=_identity[op])
    return cupy.ElementwiseKernel(in_params, out_params, loop_body,
                                  'cupy_scan_btree', loop_prep=loop_prep)
cdef _ndarray_base scan(
        _ndarray_base a, op, dtype=None, _ndarray_base out=None,
        incomplete=False, chunk_size=512):
    """Return the prefix sum(scan) of the elements.

    Args:
        a (cupy.ndarray): input array. Must be 1-D.
        op: Scan operation (``scan_op.SCAN_SUM`` or ``scan_op.SCAN_PROD``).
        dtype: Output dtype. Ignored when ``out`` is given.
        out (cupy.ndarray): Alternative output array in which to place
            the result. The same size and same type as the input array(a).
        incomplete (bool): When True, only the scanned per-chunk totals are
            returned instead of the full result.
        chunk_size (int): Number of elements handled per thread-block.

    Returns:
        cupy.ndarray: A new array holding the result is returned.
    """
    if a._shape.size() != 1:
        raise TypeError('Input array should be 1D array.')
    if out is None:
        if dtype is None:
            dtype = a.dtype
        if not incomplete:
            out = _ndarray_init(cupy.ndarray, a._shape, dtype, None)
    else:
        if a.size != out.size:
            raise ValueError('Provided out is the wrong size')
        # ``out`` dictates the accumulation dtype.
        dtype = out.dtype
    dtype = numpy.dtype(dtype)
    warp_size = _get_warpsize()
    # Pick the block-sum kernel: the shuffle variant is only usable for
    # dtypes that the platform's __shfl* intrinsics accept.
    if runtime._is_hip_environment:
        if dtype.char in 'iIfdlq':
            # On HIP, __shfl* supports int, unsigned int, float, double,
            # long, and long long. The documentation is too outdated and
            # unreliable; refer to the header at
            # $ROCM_HOME/include/hip/hcc_detail/device_functions.h
            bsum_kernel = _cupy_bsum_shfl(op, chunk_size, warp_size)
        else:
            bsum_kernel = _cupy_bsum_smem(op, chunk_size, warp_size)
    else:
        if dtype.char in 'iIlLqQfd':
            bsum_kernel = _cupy_bsum_shfl(op, chunk_size, warp_size)
        else:
            bsum_kernel = _cupy_bsum_smem(op, chunk_size, warp_size)
    # The work-efficient (btree) scan is used for floating/complex dtypes;
    # the naive scan for the rest.
    if dtype.char in 'fdFD':
        scan_kernel = _cupy_scan_btree(op, chunk_size, warp_size)
    else:
        scan_kernel = _cupy_scan_naive(op, chunk_size, warp_size)
    # b holds one total per chunk; it is scanned recursively so that
    # b[k-1] is the combined total of all chunks before chunk k.
    b_size = (a.size + chunk_size - 1) // chunk_size
    b = cupy.empty((b_size,), dtype=dtype)
    size = b.size * chunk_size
    if a.size > chunk_size:
        bsum_kernel(a, b, size=size // 2, block_size=chunk_size // 2)
        scan(b, op, dtype=dtype, out=b)
        if incomplete:
            return b
        scan_kernel(b, a, out, size=size, block_size=chunk_size)
    else:
        # Single chunk: b is never read by the kernel (block_id is 0).
        scan_kernel(b, a, out, size=size, block_size=chunk_size)
    return out
@_util.memoize(for_each_device=True)
def _inclusive_batch_scan_kernel(
        dtype, block_size, op, src_c_cont, out_c_cont):
    """return Prefix Sum(Scan) cuda kernel
    for a 2d array over axis 1
    used for scanning over different axes

    e.g
    if blocksize > len(src[0])
    src [[1, 2, 3, 4],
         [5, 6, 7, 8]]
    dst [[1, 3, 6, 10],
         [5, 11, 18, 26]]

    if blocksize < len(src[0])
    block_size: 2
    # TODO show partialness
    src [[1, 2, 3, 4],
         [5, 6, 7, 8]]
    dst [[1, 3, 3, 7],
         [5, 11, 7, 15]]

    Args:
        dtype: src, dst array type
        block_size: block_size

    Returns:
        cupy.cuda.Function: cuda function
    """
    op_char = {scan_op.SCAN_SUM: '+', scan_op.SCAN_PROD: '*'}
    identity = {scan_op.SCAN_SUM: 0, scan_op.SCAN_PROD: 1}
    name = 'cupy_inclusive_batch_scan_kernel'
    dtype = get_typename(dtype)
    # Each batch (row segment) is padded to a power of two (or to a
    # multiple of the block size) with the identity so the in-shared-memory
    # up-sweep/down-sweep below works on a regular tree; ``must_copy``
    # masks out the padding lanes on load and store.
    source = string.Template("""
    extern "C" __global__ void ${name}(
            const CArray<${dtype}, 2, ${src_c_cont}> src,
            CArray<${dtype}, 2, ${out_c_cont}> dst, int batch_size) {
        long long n = src.size();
        extern __shared__ ${dtype} temp[];
        unsigned int thid = threadIdx.x;
        unsigned int block = blockIdx.x * blockDim.x;

        unsigned int pad_batch_size = batch_size;
        bool must_copy = true;
        if (batch_size & (batch_size - 1)) {
            // Round the batch size up to the next power of two.
            pad_batch_size = 1 << (32 - __clz(batch_size));
            must_copy = (thid & (pad_batch_size - 1)) < batch_size;
        }
        if (pad_batch_size > ${block_size}) {
            // A batch spans several blocks; pad to a block multiple.
            int blocks_per_batch = (batch_size - 1) / ${block_size} + 1;
            pad_batch_size = ${block_size} * blocks_per_batch;
            // Must copy enables for all blocks but the last one in the batch
            bool last_block = (blockIdx.x + 1) % blocks_per_batch == 0;
            int remaining_batch = batch_size % ${block_size};
            if (remaining_batch == 0) {
                remaining_batch = ${block_size};
            }
            must_copy = !last_block || (thid < (remaining_batch));
        }

        int pad_per_batch = pad_batch_size - batch_size;
        int n_batches_block = ${block_size} / pad_batch_size;
        unsigned int idx0 = thid + block;
        // Map the padded linear index back to (row, col) in src/dst.
        int batch_id = idx0 / pad_batch_size;
        idx0 = idx0 - pad_per_batch * batch_id;
        int row = idx0 / batch_size;
        int col = idx0 % batch_size;
        const ptrdiff_t idx0_idx[] = {row, col};
        if (idx0 < n) {
            temp[thid] = (must_copy) ? src[idx0_idx] : (${dtype}) ${identity};
            __syncthreads();
            if (!n_batches_block) {
                // Batch larger than the block: scan the whole block.
                n_batches_block = 1;
                pad_batch_size = ${block_size};
            }
            for (int j = 0; j < n_batches_block; j++) {
                int offset = j * pad_batch_size;
                // Up-sweep (reduce) phase.
                for (int i = 1; i <= pad_batch_size; i <<= 1) {
                    int index = ((threadIdx.x + 1) * 2 * i - 1);
                    int index_block = offset + index;
                    if (index < (pad_batch_size)) {
                        temp[index_block] ${op}= temp[index_block - i];
                    }
                    __syncthreads();
                }
                // Down-sweep phase distributing partial results.
                for (int i = pad_batch_size >> 1; i > 0; i >>= 1) {
                    int index = ((threadIdx.x + 1) * 2 * i - 1);
                    int index_block = offset + index;
                    if ((index + i) < (pad_batch_size)) {
                        temp[index_block + i] ${op}= temp[index_block];
                    }
                    __syncthreads();
                }
            }
            if (must_copy) {
                dst[idx0_idx] = temp[thid];
            }
        }
    }
    """).substitute(name=name, dtype=dtype, block_size=block_size,
                    op=op_char[op], identity=identity[op],
                    src_c_cont=src_c_cont, out_c_cont=out_c_cont)
    module = compile_with_cache(source)
    return module.get_function(name)
@_util.memoize(for_each_device=True)
def _add_scan_batch_blocked_sum_kernel(dtype, op, block_size, c_cont):
    """Return a kernel that folds the scanned per-block totals back into a
    partially scanned 2-D array (second pass of the batched scan).

    Each element combines in the last element of the preceding block of its
    own row, completing the row-wise inclusive scan that
    ``_inclusive_batch_scan_kernel`` computed per block.
    """
    name = 'cupy_add_scan_blocked_sum_kernel'
    dtype = get_typename(dtype)
    ops = {scan_op.SCAN_SUM: '+', scan_op.SCAN_PROD: '*'}
    source = string.Template("""
    extern "C" __global__ void ${name}(
            CArray<${dtype}, 2, ${c_cont}> src_dst, int batch_size) {
        long long n = src_dst.size();
        unsigned int thid = threadIdx.x;
        unsigned int block = blockIdx.x * ${block_size};

        unsigned int idx0 = thid + block;

        // Respect padding
        unsigned int row = idx0 / batch_size;
        unsigned int col = idx0 % batch_size;
        // Start column of the block this element belongs to.
        int my_block = ${block_size} * (col / ${block_size});
        const ptrdiff_t dst_idx[] = {row, col};
        const ptrdiff_t src_idx[] = {row, my_block - 1};
        // Avoid for the first block of every row
        // This can be tweaked with kernel launch settings
        bool first = col < ${block_size};
        // The last element of each block already holds the correct total.
        bool is_block = (col % (${block_size})) == ${block_size} - 1;
        if (idx0 < n && !first && !is_block) {
            src_dst[dst_idx] ${op}= src_dst[src_idx];
        }
    }
    """).substitute(name=name, dtype=dtype, op=ops[op], block_size=block_size,
                    c_cont=c_cont)
    module = compile_with_cache(source)
    return module.get_function(name)
cdef _ndarray_base _batch_scan_op(
        _ndarray_base a, scan_op op, _ndarray_base out):
    """Inclusive scan of a 2-D array along axis 1, writing into ``out``.

    Works in two passes: a per-block scan kernel, then (for rows longer than
    one block) a recursive scan of the block totals which is folded back in.
    """
    batch_size = a.shape[1]
    # TODO(ecastill) replace this with "_reduction._block_size" once it is
    # properly exposed
    block_size = 512
    # Since we need to pad each batch we spawn more threads as some
    # of them will be idle
    # Calc the total number of blocks
    padded_bs = 1 << ((batch_size - 1).bit_length())
    if padded_bs > block_size:
        blocks_per_batch = (batch_size - 1) // block_size + 1
        padded_bs = block_size * blocks_per_batch
    padded_size = a.size // batch_size * padded_bs
    cdef int src_cont = int(a.flags.c_contiguous)
    cdef int out_cont = int(out.flags.c_contiguous)
    kern_scan = _inclusive_batch_scan_kernel(a.dtype, block_size, op,
                                             src_cont, out_cont)
    kern_scan(grid=((padded_size - 1) // (block_size) + 1,),
              block=(block_size,),
              args=(a, out, batch_size),
              shared_mem=a.itemsize * block_size)

    if batch_size > block_size:
        # Scan the last element of every block (the block totals) in place,
        # recursively, then distribute them over the other elements.
        blocked_sum = out[:, block_size-1::block_size]
        _batch_scan_op(blocked_sum, op, blocked_sum)
        kern_add = _add_scan_batch_blocked_sum_kernel(
            out.dtype, op, block_size, out_cont)
        kern_add(
            grid=((out.size - 1) // (block_size) + 1,),
            block=(block_size,),
            args=(out, batch_size))
    return out
cdef _proc_as_batch(_ndarray_base x, int axis, scan_op op):
    """Scan ``x`` along ``axis`` by viewing it as a 2-D batch of rows."""
    ndim = x.ndim
    axis_len = x.shape[axis]
    # An empty scan axis yields an (empty) array of the same shape.
    if axis_len == 0:
        return cupy.empty_like(x)
    # Move the scan axis to the end so every row is one independent scan.
    moved = cupy.rollaxis(x, axis, ndim)
    moved_shape = moved.shape
    rows = moved.reshape(-1, axis_len)
    _batch_scan_op(rows, op, rows)  # scans each row in place
    # Undo the axis move to restore the caller's layout.
    return cupy.rollaxis(rows.reshape(moved_shape), ndim - 1, axis)
cpdef scan_core(
        _ndarray_base a, axis, scan_op op, dtype=None, _ndarray_base out=None):
    """Driver for cumsum/cumprod: resolves dtype, picks the flat or batched
    scan, optionally dispatches to the CUB accelerator, and honors ``out``.
    """
    if out is None:
        if dtype is None:
            # Mirror NumPy's promotion: bools and narrow ints accumulate in
            # the platform (unsigned) long.
            kind = a.dtype.kind
            if kind == 'b':
                dtype = numpy.dtype('l')
            elif kind == 'i' and a.dtype.itemsize < numpy.dtype('l').itemsize:
                dtype = numpy.dtype('l')
            elif kind == 'u' and a.dtype.itemsize < numpy.dtype('L').itemsize:
                dtype = numpy.dtype('L')
            else:
                dtype = a.dtype
        result = None
    else:
        if (out.flags.c_contiguous or out.flags.f_contiguous):
            # Scan directly into the caller's buffer.
            result = out
            elementwise_copy(a, result)
        else:
            # Non-contiguous out: scan a contiguous copy, write back below.
            result = a.astype(out.dtype, order='C')
    if axis is None:
        for accelerator in _accelerator._routine_accelerators:
            if accelerator == _accelerator.ACCELERATOR_CUB:
                if result is None:
                    result = a.astype(dtype, order='C').ravel()
                # result will be None if the scan is not compatible with CUB
                if op == scan_op.SCAN_SUM:
                    cub_op = cub.CUPY_CUB_CUMSUM
                else:
                    cub_op = cub.CUPY_CUB_CUMPROD
                res = cub.cub_scan(result, cub_op)
                if res is not None:
                    break
        else:
            # No accelerator handled it: use the in-house scan kernels.
            if result is None:
                result = scan(a.ravel(), op, dtype=dtype)
            else:
                scan(result, op, dtype=dtype, out=result)
    else:
        if result is None:
            result = a.astype(dtype, order='C')
        axis = internal._normalize_axis_index(axis, a.ndim)
        result = _proc_as_batch(result, axis, op)
    # This is for when the original out param was not contiguous
    if out is not None and out.data != result.data:
        elementwise_copy(result.reshape(out.shape), out)
    else:
        out = result
    return out
# Only for test
def _scan_for_test(a, out=None):
    """Expose the raw 1-D sum-scan kernel path for unit tests only."""
    return scan(a, scan_op.SCAN_SUM, dtype=None, out=out)
cpdef _ndarray_base _nansum(_ndarray_base a, axis, dtype, out, keepdims):
    """NaN-ignoring sum: dispatch to the reduction matching the input."""
    if cupy.iscomplexobj(a):
        # Complex inputs need per-component NaN masking.
        reducer = _nansum_complex_dtype
    elif dtype is None:
        reducer = _nansum_auto_dtype
    else:
        reducer = _nansum_keep_dtype
    return reducer(a, axis, dtype, out, keepdims)
cpdef _ndarray_base _nanprod(_ndarray_base a, axis, dtype, out, keepdims):
    """NaN-ignoring product: dispatch to the reduction matching the input."""
    if cupy.iscomplexobj(a):
        # Complex inputs need per-component NaN masking.
        reducer = _nanprod_complex_dtype
    elif dtype is None:
        reducer = _nanprod_auto_dtype
    else:
        reducer = _nanprod_keep_dtype
    return reducer(a, axis, dtype, out, keepdims)
# Reduction kernels backing sum/prod and their NaN-ignoring variants.
# "*_auto_dtype" promotes narrow integer/bool inputs to (unsigned) long,
# as NumPy does; "*_keep_dtype" accumulates in the input's own dtype.
# The nan* variants map NaN inputs to the operation's identity
# (0 for sum, 1 for prod) before reducing.

_sum_auto_dtype = create_reduction_func(
    'cupy_sum',
    ('?->l', 'b->l', 'B->L', 'h->l', 'H->L', 'i->l', 'I->L', 'l->l', 'L->L',
     'q->q', 'Q->Q',
     ('e->e', (None, None, None, 'float')),
     'f->f', 'd->d', 'F->F', 'D->D'),
    ('in0', 'a + b', 'out0 = type_out0_raw(a)', None), 0)

_sum_keep_dtype = create_reduction_func(
    'cupy_sum_with_dtype',
    ('?->?', 'b->b', 'B->B', 'h->h', 'H->H', 'i->i', 'I->I', 'l->l', 'L->L',
     'q->q', 'Q->Q',
     ('e->e', (None, None, None, 'float')),
     'f->f', 'd->d', 'F->F', 'D->D'),
    ('in0', 'a + b', 'out0 = type_out0_raw(a)', None), 0)

_nansum_auto_dtype = create_reduction_func(
    'cupy_nansum',
    ('?->l', 'b->l', 'B->L', 'h->l', 'H->L', 'i->l', 'I->L', 'l->l', 'L->L',
     'q->q', 'Q->Q',
     ('e->e', (None, None, None, 'float')),
     'f->f', 'd->d', 'F->F', 'D->D'),
    # (in0 == in0) is false only for NaN; replace NaN with 0.
    ('(in0 == in0) ? in0 : type_in0_raw(0)',
     'a + b', 'out0 = type_out0_raw(a)', None), 0)

_nansum_keep_dtype = create_reduction_func(
    'cupy_nansum_with_dtype',
    ('?->?', 'b->b', 'B->B', 'h->h', 'H->H', 'i->i', 'I->I', 'l->l', 'L->L',
     'q->q', 'Q->Q',
     ('e->e', (None, None, None, 'float')),
     'f->f', 'd->d', 'F->F', 'D->D'),
    ('(in0 == in0) ? in0 : type_in0_raw(0)',
     'a + b', 'out0 = type_out0_raw(a)', None), 0)

_nansum_complex_dtype = create_reduction_func(
    'cupy_nansum_complex_dtype',
    ('F->F', 'D->D'),
    # Mask NaN in the real and imaginary components independently.
    ('''
    type_in0_raw((in0.real() == in0.real()) ? in0.real() : 0,
                 (in0.imag() == in0.imag()) ? in0.imag() : 0)
    ''',
     'a + b', 'out0 = type_out0_raw(a)', None), 0)

_prod_auto_dtype = create_reduction_func(
    'cupy_prod',
    ('?->l', 'b->l', 'B->L', 'h->l', 'H->L', 'i->l', 'I->L', 'l->l', 'L->L',
     'q->q', 'Q->Q',
     ('e->e', (None, None, None, 'float')),
     'f->f', 'd->d', 'F->F', 'D->D'),
    ('in0', 'a * b', 'out0 = type_out0_raw(a)', None), 1)

_prod_keep_dtype = create_reduction_func(
    'cupy_prod_with_dtype',
    ('?->?', 'b->b', 'B->B', 'h->h', 'H->H', 'i->i', 'I->I', 'l->l', 'L->L',
     'q->q', 'Q->Q',
     ('e->e', (None, None, None, 'float')),
     'f->f', 'd->d', 'F->F', 'D->D'),
    ('in0', 'a * b', 'out0 = type_out0_raw(a)', None), 1)

_nanprod_auto_dtype = create_reduction_func(
    'cupy_nanprod',
    ('?->l', 'b->l', 'B->L', 'h->l', 'H->L', 'i->l', 'I->L', 'l->l', 'L->L',
     'q->q', 'Q->Q',
     ('e->e', (None, None, None, 'float')),
     'f->f', 'd->d', 'F->F', 'D->D'),
    ('(in0 == in0) ? in0 : type_in0_raw(1)',
     'a * b', 'out0 = type_out0_raw(a)', None), 1)

_nanprod_keep_dtype = create_reduction_func(
    'cupy_nanprod_with_dtype',
    ('?->?', 'b->b', 'B->B', 'h->h', 'H->H', 'i->i', 'I->I', 'l->l', 'L->L',
     'q->q', 'Q->Q',
     ('e->e', (None, None, None, 'float')),
     'f->f', 'd->d', 'F->F', 'D->D'),
    ('(in0 == in0) ? in0 : type_in0_raw(1)',
     'a * b', 'out0 = type_out0_raw(a)', None), 1)

_nanprod_complex_dtype = create_reduction_func(
    'cupy_nanprod_complex_dtype',
    ('F->F', 'D->D'),
    ('''
    type_in0_raw((in0.real() == in0.real()) ? in0.real() : 1,
                 (in0.imag() == in0.imag()) ? in0.imag() : 1)
    ''',
     'a * b', 'out0 = type_out0_raw(a)', None), 1)
cdef create_arithmetic(
        name, op, boolop, doc, cutensor_op=None, scatter_op=None):
    """Build a binary arithmetic ufunc named ``cupy_<name>``.

    ``boolop`` is either a str (the operator for bool-bool inputs) or a
    callable (a function to raise an error for bool-bool inputs).
    """
    bool_routine = boolop
    if isinstance(bool_routine, str):
        bool_routine = 'out0 = in0 %s in1' % bool_routine
    type_table = (
        ('??->?', bool_routine),
        'bb->b', 'BB->B', 'hh->h', 'HH->H', 'ii->i', 'II->I', 'll->l',
        'LL->L', 'qq->q', 'QQ->Q', 'ee->e', 'ff->f', 'dd->d', 'FF->F',
        'DD->D')
    return create_ufunc(
        'cupy_' + name,
        type_table,
        'out0 = in0 %s in1' % op,
        doc=doc,
        cutensor_op=cutensor_op,
        scatter_op=scatter_op)
# Elementwise arithmetic / complex-helper ufuncs exposed below.

_add = create_arithmetic(
    'add', '+', '|',
    '''Adds two arrays elementwise.

    .. seealso:: :data:`numpy.add`

    ''',
    cutensor_op=('OP_ADD', 1, 1), scatter_op='add')

_conjugate = create_ufunc(
    'cupy_conjugate',
    ('b->b', 'B->B', 'h->h', 'H->H', 'i->i', 'I->I', 'l->l', 'L->L', 'q->q',
     'Q->Q', 'e->e', 'f->f', 'd->d',
     ('F->F', 'out0 = conj(in0)'),
     ('D->D', 'out0 = conj(in0)')),
    # Real dtypes are their own conjugate.
    'out0 = in0',
    doc='''Returns the complex conjugate, element-wise.

    .. seealso:: :data:`numpy.conjugate`

    ''')

_angle = create_ufunc(
    'cupy_angle',
    ('?->d', 'e->e', 'f->f', 'd->d',
     ('F->f', 'out0 = arg(in0)'),
     ('D->d', 'out0 = arg(in0)')),
    # For real inputs the angle is 0 for non-negative, pi for negative.
    'out0 = in0 >= 0 ? 0 : M_PI',
    doc='''Returns the angle of the complex argument.

    .. seealso:: :func:`numpy.angle`

    ''')

_angle_deg = create_ufunc(
    'cupy_angle_deg',
    ('?->d', 'e->e', 'f->f', 'd->d',
     ('F->f', 'out0 = arg(in0) * (180.0 / M_PI)'),
     ('D->d', 'out0 = arg(in0) * (180.0 / M_PI)')),
    'out0 = in0 >= 0 ? 0 : 180.0',
    doc='''Returns the angle of the complex argument.

    .. seealso:: :func:`numpy.angle`

    ''')
def _positive_boolean_error():
raise TypeError(
'The cupy boolean positive, the `+` operator, is not supported.')
# Unary ``+``; bool inputs raise via _positive_boolean_error (NumPy parity).
_positive = create_ufunc(
    'cupy_positive',
    (('?->?', _positive_boolean_error),
     'b->b', 'B->B', 'h->h', 'H->H', 'i->i', 'I->I', 'l->l', 'L->L',
     'q->q', 'Q->Q', 'e->e', 'f->f', 'd->d', 'F->F', 'D->D'),
    'out0 = +in0',
    doc='''Takes numerical positive elementwise.

    .. seealso:: :data:`numpy.positive`

    ''')
def _negative_boolean_error():
raise TypeError(
'The cupy boolean negative, the `-` operator, is not supported, '
'use the `~` operator or the logical_not function instead.')
# Unary ``-``; bool inputs raise via _negative_boolean_error (NumPy parity).
_negative = create_ufunc(
    'cupy_negative',
    (('?->?', _negative_boolean_error),
     'b->b', 'B->B', 'h->h', 'H->H', 'i->i', 'I->I', 'l->l', 'L->L',
     'q->q', 'Q->Q', 'e->e', 'f->f', 'd->d', 'F->F', 'D->D'),
    'out0 = -in0',
    doc='''Takes numerical negative elementwise.

    .. seealso:: :data:`numpy.negative`

    ''')

# Bool * bool is logical AND (&), matching NumPy.
_multiply = create_arithmetic(
    'multiply', '*', '&',
    '''Multiplies two arrays elementwise.

    .. seealso:: :data:`numpy.multiply`

    ''',
    cutensor_op=('OP_MUL', 1, 1))
# `integral_power` should return somewhat appropriate values for negative
# integral powers (for which NumPy would raise errors). Hence the branches in
# the beginning. This behavior is not officially documented and could change.
cdef _power_preamble = '''
template <typename T>
inline __device__ T integral_power(T in0, T in1) {
    if (in1 < 0) {
        // Negative exponents of integers: only |base| == 1 is non-zero.
        if (in0 == -1) {return (in1 & 1) ? -1 : 1;}
        else {return (in0 == 1) ? 1 : 0;}
    }
    // Exponentiation by squaring.
    T out0 = 1;
    while (in1 > 0) {
        if (in1 & 1) {
            out0 *= in0;
        }
        in0 *= in0;
        in1 >>= 1;
    }
    return out0;
}

template <typename T>
inline __device__ T complex_power(T in0, T in1) {
    // pow(0, 0) is defined as 1 here (matches NumPy's convention).
    return in1 == T(0) ? T(1): pow(in0, in1);
}
'''

_power = create_ufunc(
    'cupy_power',
    ('??->b', 'bb->b', 'BB->B', 'hh->h', 'HH->H', 'ii->i', 'II->I', 'll->l',
     'LL->L', 'qq->q', 'QQ->Q',
     ('ee->e', 'out0 = powf(in0, in1)'),
     ('ff->f', 'out0 = powf(in0, in1)'),
     ('dd->d', 'out0 = pow(in0, in1)'),
     ('FF->F', 'out0 = complex_power(in0, in1)'),
     ('DD->D', 'out0 = complex_power(in0, in1)')),
    'out0 = integral_power(in0, in1)',
    preamble=_power_preamble,
    doc='''Computes ``x1 ** x2`` elementwise.

    .. seealso:: :data:`numpy.power`

    ''')
def _subtract_boolean_error():
raise TypeError(
'cupy boolean subtract, the `-` operator, is deprecated, use the '
'bitwise_xor, the `^` operator, or the logical_xor function instead.')
# Bool - bool raises (deprecated in NumPy); see _subtract_boolean_error.
_subtract = create_arithmetic(
    'subtract', '-', _subtract_boolean_error,
    '''Subtracts arguments elementwise.

    .. seealso:: :data:`numpy.subtract`

    ''',
    cutensor_op=('OP_ADD', 1, -1), scatter_op='sub')

# Integer inputs promote to double, as in NumPy's true_divide.
_true_divide = create_ufunc(
    'cupy_true_divide',
    ('bb->d', 'BB->d', 'hh->d', 'HH->d', 'ii->d', 'II->d', 'll->d', 'LL->d',
     'qq->d', 'QQ->d', 'ee->e', 'ff->f', 'dd->d', 'FF->F', 'DD->D'),
    'out0 = (out0_type)in0 / (out0_type)in1',
    doc='''Elementwise true division (i.e. division as floating values).

    .. seealso:: :data:`numpy.true_divide`

    ''',
    out_ops=('ee->e', 'ff->f', 'dd->d', 'FF->F', 'DD->D'),
)

# In Python 3, `/` is true division.
_divide = _true_divide

_floor_divide = create_ufunc(
    'cupy_floor_divide',
    ('bb->b', 'BB->B', 'hh->h', 'HH->H', 'ii->i', 'II->I', 'll->l', 'LL->L',
     'qq->q', 'QQ->Q', 'ee->e', 'ff->f', 'dd->d'),
    'out0 = _floor_divide(in0, in1)',
    doc='''Elementwise floor division (i.e. integer quotient).

    .. seealso:: :data:`numpy.floor_divide`

    ''')

_remainder = create_ufunc(
    'cupy_remainder',
    ('bb->b', 'BB->B', 'hh->h', 'HH->H', 'ii->i', 'II->I', 'll->l', 'LL->L',
     'qq->q', 'QQ->Q',
     ('ee->e', 'out0 = in0 - _floor_divide(in0, in1) * in1'),
     ('ff->f', 'out0 = in0 - _floor_divide(in0, in1) * in1'),
     ('dd->d', 'out0 = in0 - _floor_divide(in0, in1) * in1')),
    # Integer case: the trailing `* (in1 != 0)` forces the result of a
    # division by zero to 0, matching NumPy's `x % 0 == 0` behavior.
    'out0 = (in0 - _floor_divide(in0, in1) * in1) * (in1 != 0)',
    doc='''Computes the remainder of Python division elementwise.

    .. seealso:: :data:`numpy.remainder`

    ''')

_absolute = create_ufunc(
    'cupy_absolute',
    # Unsigned/bool dtypes are already non-negative: plain copy.
    (('?->?', 'out0 = in0'),
     'b->b', ('B->B', 'out0 = in0'), 'h->h', ('H->H', 'out0 = in0'),
     'i->i', ('I->I', 'out0 = in0'), 'l->l', ('L->L', 'out0 = in0'),
     'q->q', ('Q->Q', 'out0 = in0'),
     ('e->e', 'out0 = fabsf(in0)'),
     ('f->f', 'out0 = fabsf(in0)'),
     ('d->d', 'out0 = fabs(in0)'),
     ('F->f', 'out0 = abs(in0)'),
     ('D->d', 'out0 = abs(in0)')),
    'out0 = in0 > 0 ? in0 : -in0',
    doc='''Elementwise absolute value function.

    .. seealso:: :data:`numpy.absolute`

    ''')

_sqrt = create_ufunc(
    'cupy_sqrt',
    ('e->e', 'f->f', 'd->d', 'F->F', 'D->D'),
    'out0 = sqrt(in0)',
    doc='''Elementwise square root function.

    .. seealso:: :data:`numpy.sqrt`

    ''')

# clip(x, lo, hi); when lo > hi the result is hi, matching NumPy.
_clip = create_ufunc(
    'cupy_clip',
    ('???->?', 'bbb->b', 'BBB->B', 'hhh->h', 'HHH->H', 'iii->i', 'III->I',
     'lll->l', 'LLL->L', 'qqq->q', 'QQQ->Q', 'eee->e', 'fff->f', 'ddd->d'),
    'out0 = in1 > in2 ? in2 : (in0 < in1 ? in1 : (in0 > in2 ? in2 : in0))')
# Variables to expose to Python
# (cythonized data cannot be exposed to Python, even with cpdef.)
add = _add
conjugate = _conjugate
angle = _angle
angle_deg = _angle_deg
positive = _positive
negative = _negative
multiply = _multiply
divide = _divide
power = _power
subtract = _subtract
true_divide = _true_divide
floor_divide = _floor_divide
remainder = _remainder
absolute = _absolute
sqrt = _sqrt

sum_auto_dtype = _sum_auto_dtype  # used from cupy/math/sumprod.py
nansum_auto_dtype = _nansum_auto_dtype  # used from cupy/math/sumprod.py
prod_auto_dtype = _prod_auto_dtype  # used from cupy/math/sumprod.py
nanprod_auto_dtype = _nanprod_auto_dtype  # used from cupy/math/sumprod.py
clip = _clip  # used from cupy/math/misc.py
from cupy._core.core cimport _ndarray_base
cdef _ndarray_sort(_ndarray_base self, int axis)
cdef _ndarray_base _ndarray_argsort(_ndarray_base self, axis)
cdef _ndarray_partition(_ndarray_base self, kth, int axis)
cdef _ndarray_base _ndarray_argpartition(self, kth, axis)
import string
import numpy
import cupy
from cupy._core._scalar import get_typename as _get_typename
from cupy._core._ufuncs import elementwise_copy
import cupy._core.core as core
from cupy import _util
from cupy.cuda import thrust
from cupy._core cimport _routines_manipulation as _manipulation
from cupy._core.core cimport compile_with_cache
from cupy._core.core cimport _ndarray_base
from cupy._core cimport internal
cdef _ndarray_sort(_ndarray_base self, int axis):
    """In-place sort of ``self`` along ``axis`` using Thrust."""
    cdef int ndim = self._shape.size()
    cdef _ndarray_base data

    if not cupy.cuda.thrust.available:
        raise RuntimeError('Thrust is needed to use cupy.sort. Please '
                           'install CUDA Toolkit with Thrust then '
                           'reinstall CuPy after uninstalling it.')

    if ndim == 0:
        raise numpy.AxisError('Sorting arrays with the rank of zero is not '
                              'supported')  # as numpy.sort() raises

    # TODO(takagi): Support sorting views
    if not self._c_contiguous:
        raise NotImplementedError('Sorting non-contiguous array is not '
                                  'supported.')

    axis = internal._normalize_axis_index(axis, ndim)

    # Thrust sorts along the last axis; bring the target axis there
    # (a copy is needed because we operate on raw pointers below).
    if axis == ndim - 1:
        data = self
    else:
        data = _manipulation.rollaxis(self, axis, ndim).copy()

    if ndim == 1:
        thrust.sort(self.dtype, data.data.ptr, 0, self.shape)
    else:
        # Batched sort: process at most max_size rows at a time so the
        # auxiliary key buffer stays bounded (~1 << 22 elements).
        max_size = max(min(1 << 22, data.size) // data.shape[-1], 1)
        keys_array = core.ndarray(
            (max_size * data.shape[-1],), dtype=numpy.intp)
        stop = data.size // data.shape[-1]
        for offset in range(0, stop, max_size):
            width = min(max_size, stop - offset)
            thrust.sort(
                self.dtype,
                data.data.ptr + offset * data.shape[-1] * data.itemsize,
                keys_array.data.ptr,
                (width, data.shape[-1]),
            )

    if axis == ndim - 1:
        # ``data`` is ``self``: already sorted in place.
        pass
    else:
        data = _manipulation.rollaxis(data, -1, axis)
        elementwise_copy(data, self)
cdef _ndarray_base _ndarray_argsort(_ndarray_base self, axis):
    """Return the indices that would sort ``self`` along ``axis``
    (flattened when ``axis`` is None), using Thrust.
    """
    cdef int _axis, ndim
    cdef _ndarray_base data

    if not cupy.cuda.thrust.available:
        raise RuntimeError('Thrust is needed to use cupy.argsort. Please '
                           'install CUDA Toolkit with Thrust then '
                           'reinstall CuPy after uninstalling it.')

    self = cupy.atleast_1d(self)
    ndim = self._shape.size()

    if axis is None:
        data = self.ravel()
        _axis = -1
    else:
        data = self
        _axis = axis

    _axis = internal._normalize_axis_index(_axis, ndim)

    # Thrust argsorts along the last axis; copy because the sort mutates
    # ``data`` while producing the index array.
    if _axis == ndim - 1:
        data = data.copy()
    else:
        data = _manipulation.rollaxis(data, _axis, ndim).copy()

    shape = data.shape
    idx_array = core.ndarray(shape, dtype=numpy.intp)

    if ndim == 1:
        thrust.argsort(self.dtype, idx_array.data.ptr, data.data.ptr, 0,
                       shape)
    else:
        keys_array = core.ndarray(shape, dtype=numpy.intp)
        thrust.argsort(self.dtype, idx_array.data.ptr, data.data.ptr,
                       keys_array.data.ptr, shape)

    if _axis == ndim - 1:
        return idx_array
    else:
        return _manipulation.rollaxis(idx_array, -1, _axis)
cdef _ndarray_partition(_ndarray_base self, kth, int axis):
    """Partitions an array.

    Args:
        kth (int or sequence of ints): Element index to partition by. If
            supplied with a sequence of k-th it will partition all elements
            indexed by k-th of them into their sorted position at once.

        axis (int): Axis along which to sort. Default is -1, which means
            sort along the last axis.

    .. seealso::
        :func:`cupy.partition` for full documentation,
        :meth:`numpy.ndarray.partition`

    """
    cdef int ndim = self._shape.size()
    cdef Py_ssize_t k, max_k, length, s, sz, t
    cdef _ndarray_base data

    if ndim == 0:
        raise numpy.AxisError('Sorting arrays with the rank of zero is not '
                              'supported')

    if not self._c_contiguous:
        raise NotImplementedError('Sorting non-contiguous array is not '
                                  'supported.')

    axis = internal._normalize_axis_index(axis, ndim)

    # Work with the partition axis as the last axis.
    if axis == ndim - 1:
        data = self
    else:
        data = _manipulation.rollaxis(self, axis, ndim).copy()

    length = self._shape[axis]
    if isinstance(kth, int):
        kth = kth,
    max_k = 0
    for k in kth:
        if k < 0:
            k += length
        if not (0 <= k < length):
            raise ValueError('kth(={}) out of bounds {}'.format(k, length))
        if max_k < k:
            max_k = k

    # For simplicity, max_k is round up to the power of 2. If max_k is
    # already the power of 2, it is round up to the next power of 2 because
    # we need to collect the first max(kth)+1 elements.
    max_k = max(32, 1 << max_k.bit_length())

    # The parameter t is the length of the list that stores elements to be
    # selected for each thread. We divide the array into sz subarrays.
    # These parameters are determined from the measurement on TITAN X.
    t = 4
    sz = 512
    while sz > 0 and length // sz < max_k + 32 * t:
        sz //= 2
    sz *= self.size // length

    # If the array size is small or k is large, we simply sort the array.
    if length < 32 or sz <= 32 or max_k >= 1024:
        # kth is ignored.
        data.sort(axis=-1)
    else:
        shape = data.shape
        data = data.ravel()

        # For each subarray, we collect first k elements to the head.
        kern, merge_kern = _partition_kernel(self.dtype)
        block_size = 32
        grid_size = sz
        kern(grid=(grid_size,), block=(block_size,), args=(
            data, max_k, self.size, t, sz))

        # Merge heads of subarrays.
        s = 1
        while s < sz // (self.size // length):
            block_size = 32
            grid_size = sz // s // 2
            merge_kern(grid=(grid_size,), block=(block_size,), args=(
                data, max_k, self.size, sz, s))
            s *= 2

        data = data.reshape(shape)

    if axis != ndim - 1:
        data = _manipulation.rollaxis(data, -1, axis)
        elementwise_copy(data, self)
cdef _ndarray_base _ndarray_argpartition(self, kth, axis):
    """Returns the indices that would partially sort an array.

    Args:
        kth (int or sequence of ints): Element index to partition by. If
            supplied with a sequence of k-th it will partition all elements
            indexed by k-th of them into their sorted position at once.

        axis (int or None): Axis along which to sort. Default is -1, which
            means sort along the last axis. If None is supplied, the array
            is flattened before sorting.

    Returns:
        cupy.ndarray: Array of the same type and shape as ``a``.

    .. seealso::
        :func:`cupy.argpartition` for full documentation,
        :meth:`numpy.ndarray.argpartition`

    """
    cdef int _axis, ndim
    cdef Py_ssize_t k, max_k, length, s, sz, t
    cdef _ndarray_base data

    if axis is None:
        data = self.ravel()
        _axis = -1
    else:
        data = self
        _axis = axis

    ndim = data._shape.size()
    _axis = internal._normalize_axis_index(_axis, ndim)

    # Work with the partition axis as the last axis.
    if _axis != ndim - 1:
        data = _manipulation.rollaxis(self, _axis, ndim).copy()

    length = data._shape[ndim - 1]
    if length == 0:
        return cupy.empty((0,), dtype=cupy.int64)

    if isinstance(kth, int):
        kth = kth,
    max_k = 0
    for k in kth:
        if k < 0:
            k += length
        if not (0 <= k < length):
            raise ValueError('kth(={}) out of bounds {}'.format(k, length))
        if max_k < k:
            max_k = k

    # For simplicity, max_k is round up to the power of 2. If max_k is
    # already the power of 2, it is round up to the next power of 2 because
    # we need to collect the first max(kth)+1 elements.
    max_k = max(32, 1 << max_k.bit_length())

    # The parameter t is the length of the list that stores elements to be
    # selected for each thread. We divide the array into sz subarrays.
    # These parameters are determined from the measurement on TITAN X.
    t = 4
    sz = 512
    while sz > 0 and length // sz < max_k + 32 * t:
        sz //= 2
    sz *= self.size // length

    shape = data.shape
    # If the array size is small or k is large, we simply sort the array.
    if length < 32 or sz < 1 or max_k >= 1024:
        # kth is ignored.
        indices = data.argsort(axis=-1)
    else:
        data = data.ravel()
        indices = cupy.arange(0, data.shape[0], dtype=cupy.int64)

        # For each subarray, we collect first k elements to the head.
        kern, merge_kern = _argpartition_kernel(self.dtype)
        block_size = 32
        grid_size = sz
        kern(grid=(grid_size,), block=(block_size,), args=(
            data, indices, max_k, self.size, t, sz))

        # Merge heads of subarrays.
        s = 1
        while s < sz // (self.size // length):
            block_size = 32
            grid_size = sz // s // 2
            merge_kern(grid=(grid_size,), block=(block_size,), args=(
                data, indices, max_k, self.size, sz, s))
            s *= 2

        # Rearrange indices w.r.t the original axis
        axis_indices = cupy.unravel_index(indices, shape)
        indices = axis_indices[-1]

    indices = indices.reshape(shape)
    if _axis != ndim - 1:
        indices = _manipulation.rollaxis(indices, -1, _axis)

    return indices
@_util.memoize(for_each_device=True)
def _partition_kernel(dtype):
    """Compile (with caching) the CUDA kernels used by in-place partition.

    Returns a pair ``(kern, merge_kern)``: the per-subarray selection
    kernel and the kernel that merges heads of adjacent subarrays.
    Memoized per device and per ``dtype``.
    """
    name = 'partition_kernel'
    merge_kernel = 'partition_merge_kernel'
    # Translate the numpy dtype into its CUDA C++ type name; it is
    # substituted into the ${dtype} placeholders of the template below.
    dtype = _get_typename(dtype)
    source = string.Template('''
template<typename T>
__device__ void bitonic_sort_step(CArray<T, 1, true> a,
        ptrdiff_t x, ptrdiff_t y, int i, ptrdiff_t s, ptrdiff_t w) {
    for (ptrdiff_t j = i; j < (y - x) / 2; j += 32) {
        ptrdiff_t n = j + (j & -w);
        T v = a[n + x], u = a[n + w + x];
        if (n & s ? v < u : v > u) {
            a[n + x] = u;
            a[n + w + x] = v;
        }
    }
}

// Sort a[x:y].
template<typename T>
__device__ void bitonic_sort(
        CArray<T, 1, true> a, ptrdiff_t x, ptrdiff_t y, int i) {
    for (ptrdiff_t s = 2; s <= y - x; s *= 2) {
        for (ptrdiff_t w = s / 2; w >= 1; w /= 2) {
            bitonic_sort_step< T >(a, x, y, i, s, w);
        }
    }
}

// Merge first k elements and the next 32 times t elements.
template<typename T>
__device__ void merge(
        CArray<T, 1, true> a,
        int k, int i, ptrdiff_t x, ptrdiff_t z, int u) {
    for (int s = i; s < u; s += 32) {
        if (a[x + k - s - 1] > a[z + s]) {
            T tmp = a[x + k - s - 1];
            a[x + k - s - 1] = a[z + s];
            a[z + s] = tmp;
        }
    }

    // After merge step, the first k elements are already bitonic.
    // Therefore, we do not need to fully sort.
    for (int w = k / 2; w >= 1; w /= 2) {
        bitonic_sort_step< T >(a, x, k + x, i, k, w);
    }
}

extern "C" {
// In this function, 32 threads handle one subarray. This number equals to
// the warp size. The first k elements are always sorted and the next 32
// times t elements stored values that have possibilities to be selected.
__global__ void ${name}(
        CArray<${dtype}, 1, true> a,
        int k, ptrdiff_t n, int t, ptrdiff_t sz) {
    // This thread handles a[z:m].
    ptrdiff_t i = static_cast<ptrdiff_t>(blockIdx.x) * blockDim.x
        + threadIdx.x;
    ptrdiff_t z = i / 32 * n / sz;
    ptrdiff_t m = (i / 32 + 1) * n / sz;
    int id = i % 32;
    int x = 0;
    bitonic_sort< ${dtype} >(a, z, k + z, id);
    ptrdiff_t j;
    for (j = k + id + z; j < m - (m - z) % 32; j += 32) {
        if (a[j] < a[k - 1 + z]) {
            ${dtype} tmp = a[k + 32 * x + id + z];
            a[k + 32 * x + id + z] = a[j];
            a[j] = tmp;
            ++x;
        }

        // If at least one thread in the warp has found t values that
        // can be selected, we update the first k elements.
#if __CUDACC_VER_MAJOR__ >= 9
        if (__any_sync(0xffffffff, x >= t)) {
#else
        if (__any(x >= t)) {
#endif
            bitonic_sort< ${dtype} >(a, k + z, 32 * t + k + z, id);
            merge< ${dtype} >(a, k, id, z, k + z, min(k, 32 * t));
            x = 0;
        }
    }
    if (j < m && a[j] < a[k - 1 + z]) {
        ${dtype} tmp = a[k + 32 * x + id + z];
        a[k + 32 * x + id + z] = a[j];
        a[j] = tmp;
    }

    // Finally, we merge the first k elements and the remainders to be
    // stored.
    bitonic_sort< ${dtype} >(a, k + z, 32 * t + k + z, id);
    merge< ${dtype} >(a, k, id, z, k + z, min(k, 32 * t));
}

__global__ void ${merge_kernel}(
        CArray<${dtype}, 1, true> a, int k, ptrdiff_t n, int sz, int s) {
    ptrdiff_t i = static_cast<ptrdiff_t>(blockIdx.x) * blockDim.x
        + threadIdx.x;
    ptrdiff_t z = i / 32 * 2 * s * n / sz;
    ptrdiff_t m = (i / 32 * 2 + 1) * s * n / sz;
    int id = i % 32;
    merge< ${dtype} >(a, k, id, z, m, k);
}
}
''').substitute(name=name, merge_kernel=merge_kernel, dtype=dtype)
    # Compile once per (device, dtype); subsequent calls hit the memo cache.
    module = compile_with_cache(source)
    return module.get_function(name), module.get_function(merge_kernel)
@_util.memoize(for_each_device=True)
def _argpartition_kernel(dtype):
    """Compile (with caching) the CUDA kernels used by ``argpartition``.

    Same algorithm as ``_partition_kernel`` but operates indirectly: the
    kernels permute an index array ``b`` (long long) while comparing the
    values ``a[b[...]]``, leaving ``a`` itself untouched.
    Returns ``(kern, merge_kern)``; memoized per device and per ``dtype``.
    """
    name = 'argpartition_kernel'
    merge_kernel = 'argpartition_merge_kernel'
    # CUDA C++ type name for the ${dtype} template placeholder.
    dtype = _get_typename(dtype)
    source = string.Template('''
template<typename T>
__device__ void bitonic_sort_step(
        CArray<T, 1, true> a, CArray<long long, 1, true> b,
        ptrdiff_t x, ptrdiff_t y, int i, ptrdiff_t s, ptrdiff_t w) {
    for (ptrdiff_t j = i; j < (y - x) / 2; j += 32) {
        ptrdiff_t n = j + (j & -w);
        T v = a[b[n + x]], u = a[b[n + w + x]];
        if (n & s ? v < u : v > u) {
            long long temp = b[n + x];
            b[n + x] = b[n + w + x];
            b[n + w + x] = temp;
        }
    }
}

// Sort a[x:y].
template<typename T>
__device__ void bitonic_sort(
        CArray<T, 1, true> a, CArray<long long, 1, true> b,
        ptrdiff_t x, ptrdiff_t y, int i) {
    for (ptrdiff_t s = 2; s <= y - x; s *= 2) {
        for (ptrdiff_t w = s / 2; w >= 1; w /= 2) {
            bitonic_sort_step< T >(a, b, x, y, i, s, w);
        }
    }
}

// Merge first k elements and the next 32 times t elements.
template<typename T>
__device__ void merge(
        CArray<T, 1, true> a, CArray<long long, 1, true> b,
        int k, int i, ptrdiff_t x, ptrdiff_t z, int u) {
    for (int s = i; s < u; s += 32) {
        if (a[b[x + k - s - 1]] > a[b[z + s]]) {
            long long tmp = b[x + k - s - 1];
            b[x + k - s - 1] = b[z + s];
            b[z + s] = tmp;
        }
    }

    // After merge step, the first k elements are already bitonic.
    // Therefore, we do not need to fully sort.
    for (int w = k / 2; w >= 1; w /= 2) {
        bitonic_sort_step< T >(a, b, x, k + x, i, k, w);
    }
}

extern "C" {
// In this function, 32 threads handle one subarray. This number equals to
// the warp size. The first k elements are always sorted and the next 32
// times t elements stored values that have possibilities to be selected.
__global__ void ${name}(
        CArray<${dtype}, 1, true> a, CArray<long long, 1, true> b,
        int k, ptrdiff_t n, int t, ptrdiff_t sz) {
    // This thread handles a[z:m].
    ptrdiff_t i = static_cast<ptrdiff_t>(blockIdx.x) * blockDim.x
        + threadIdx.x;
    ptrdiff_t z = i / 32 * n / sz;
    ptrdiff_t m = (i / 32 + 1) * n / sz;
    int id = i % 32;
    int x = 0;
    bitonic_sort< ${dtype} >(a, b, z, k + z, id);
    ptrdiff_t j;
    for (j = k + id + z; j < m - (m - z) % 32; j += 32) {
        if (a[b[j]] < a[b[k - 1 + z]]) {
            long long tmp = b[k + 32 * x + id + z];
            b[k + 32 * x + id + z] = b[j];
            b[j] = tmp;
            ++x;
        }

        // If at least one thread in the warp has found t values that
        // can be selected, we update the first k elements.
#if __CUDACC_VER_MAJOR__ >= 9
        if (__any_sync(0xffffffff, x >= t)) {
#else
        if (__any(x >= t)) {
#endif
            bitonic_sort< ${dtype} >(a, b, k + z, 32 * t + k + z, id);
            merge< ${dtype} >(a, b, k, id, z, k + z, min(k, 32 * t));
            x = 0;
        }
    }
    if (j < m && a[b[j]] < a[b[k - 1 + z]]) {
        long long tmp = b[k + 32 * x + id + z];
        b[k + 32 * x + id + z] = b[j];
        b[j] = tmp;
    }

    // Finally, we merge the first k elements and the remainders to be
    // stored.
    bitonic_sort< ${dtype} >(a, b, k + z, 32 * t + k + z, id);
    merge< ${dtype} >(a, b, k, id, z, k + z, min(k, 32 * t));
}

__global__ void ${merge_kernel}(
        CArray<${dtype}, 1, true> a, CArray<long long, 1, true> b,
        int k, ptrdiff_t n, int sz, int s) {
    ptrdiff_t i = static_cast<ptrdiff_t>(blockIdx.x) * blockDim.x
        + threadIdx.x;
    ptrdiff_t z = i / 32 * 2 * s * n / sz;
    ptrdiff_t m = (i / 32 * 2 + 1) * s * n / sz;
    int id = i % 32;
    merge< ${dtype} >(a, b, k, id, z, m, k);
}
}
''').substitute(name=name, merge_kernel=merge_kernel, dtype=dtype)
    module = compile_with_cache(source)
    return module.get_function(name), module.get_function(merge_kernel)
from cupy._core.core cimport _ndarray_base
# TODO(niboshi): Move {nan,}arg{min,max} to sorting
# Declarations (.pxd-style) for the statistics routines implemented in the
# corresponding .pyx file.  All follow NumPy-style reduction signatures
# (axis / out / dtype / keepdims; var & std additionally take ddof).
cdef _ndarray_base _ndarray_max(_ndarray_base self, axis, out, dtype, keepdims)
cdef _ndarray_base _ndarray_min(_ndarray_base self, axis, out, dtype, keepdims)
cdef _ndarray_base _ndarray_ptp(_ndarray_base self, axis, out, keepdims)
cdef _ndarray_base _ndarray_argmax(
    _ndarray_base self, axis, out, dtype, keepdims)
cdef _ndarray_base _ndarray_argmin(
    _ndarray_base self, axis, out, dtype, keepdims)
cdef _ndarray_base _ndarray_mean(
    _ndarray_base self, axis, dtype, out, keepdims)
cdef _ndarray_base _ndarray_var(
    _ndarray_base self, axis, dtype, out, ddof, keepdims)
cdef _ndarray_base _ndarray_std(
    _ndarray_base self, axis, dtype, out, ddof, keepdims)
# NaN-aware variants and median helpers (cpdef: callable from Python too).
cpdef _ndarray_base _median(
    _ndarray_base a, axis, out, overwrite_input, keepdims)
cpdef _ndarray_base _nanmean(_ndarray_base a, axis, dtype, out, keepdims)
cpdef _ndarray_base _nanvar(_ndarray_base a, axis, dtype, out, ddof, keepdims)
cpdef _ndarray_base _nanstd(_ndarray_base a, axis, dtype, out, ddof, keepdims)
cpdef _ndarray_base _nanargmin(_ndarray_base a, axis, out, dtype, keepdims)
cpdef _ndarray_base _nanargmax(_ndarray_base a, axis, out, dtype, keepdims)
from cpython cimport sequence
import numpy
from numpy import nan
import cupy
from cupy._core import _reduction
from cupy._core._reduction import create_reduction_func
from cupy._core._reduction import ReductionKernel
from cupy._core._kernel import ElementwiseKernel
from cupy._core._ufuncs import elementwise_copy
from cupy._core cimport _accelerator
from cupy._core cimport _routines_math as _math
from cupy._core.core cimport _ndarray_base
from cupy.cuda import cub
try:
import cupy_backends.cuda.libs.cutensor as cuda_cutensor
except ImportError:
cuda_cutensor = None
cdef _ndarray_base _ndarray_max(
        _ndarray_base self, axis, out, dtype, keepdims):
    """Max reduction, trying the configured accelerators before the
    generic ``_amax`` kernel.
    """
    for acc in _accelerator._routine_accelerators:
        result = None
        if acc == _accelerator.ACCELERATOR_CUB:
            # CUB returns None when it cannot handle this reduction.
            result = cub.cub_reduction(
                self, cub.CUPY_CUB_MAX, axis, dtype, out, keepdims)
        elif acc == _accelerator.ACCELERATOR_CUTENSOR and \
                cuda_cutensor is not None:
            from cupyx import cutensor
            if self.dtype.kind == 'c' or dtype in ('F', 'D'):
                # cuTENSOR does not support complex dtypes here.
                continue
            result = cutensor._try_reduction_routine(
                self, axis, dtype, out, keepdims, cuda_cutensor.OP_MAX, 1, 0)
        if result is not None:
            return result
    # No accelerator produced a result: fall back to the generic kernel.
    return _amax(self, axis=axis, out=out, dtype=dtype, keepdims=keepdims)
cdef _ndarray_base _ndarray_min(
        _ndarray_base self, axis, out, dtype, keepdims):
    """Min reduction, trying the configured accelerators before the
    generic ``_amin`` kernel.
    """
    for accelerator in _accelerator._routine_accelerators:
        result = None
        if accelerator == _accelerator.ACCELERATOR_CUB:
            # result will be None if the reduction is not compatible with CUB
            # NOTE: cub_reduction takes (arr, op, axis, dtype, out, keepdims);
            # the previous code passed ``out`` before ``dtype``, unlike every
            # other call site in this file (_ndarray_max, _ndarray_arg*).
            result = cub.cub_reduction(
                self, cub.CUPY_CUB_MIN, axis, dtype, out, keepdims)
        if (accelerator == _accelerator.ACCELERATOR_CUTENSOR and
                cuda_cutensor is not None):
            from cupyx import cutensor
            if self.dtype.kind == 'c' or dtype in ('F', 'D'):
                # Complex dtype is not supported
                continue
            result = cutensor._try_reduction_routine(
                self, axis, dtype, out, keepdims, cuda_cutensor.OP_MIN, 1, 0)
        if result is not None:
            return result
    return _amin(self, axis=axis, out=out, dtype=dtype, keepdims=keepdims)
cdef _ndarray_base _ndarray_ptp(_ndarray_base self, axis, out, keepdims):
    """Peak-to-peak (max - min) reduction with accelerator fast paths."""
    for accelerator in _accelerator._routine_accelerators:
        if accelerator == _accelerator.ACCELERATOR_CUB:
            # result will be None if the reduction is not compatible with CUB
            # NOTE: cub_reduction takes (arr, op, axis, dtype, out, keepdims);
            # the previous code passed ``out`` in the dtype slot.
            result = cub.cub_reduction(
                self, cub.CUPY_CUB_MAX, axis, None, out, keepdims)
            if result is not None:
                # Subtract the min in place to obtain max - min.
                result -= cub.cub_reduction(
                    self, cub.CUPY_CUB_MIN, axis, None, None, keepdims)
                return result
        if (accelerator == _accelerator.ACCELERATOR_CUTENSOR and
                cuda_cutensor is not None):
            from cupyx import cutensor
            if self.dtype.kind == 'c':
                # Complex dtype is not supported
                continue
            maxv = cutensor._try_reduction_routine(
                self, axis, None, out, keepdims, cuda_cutensor.OP_MAX, 1, 0)
            if maxv is None:
                continue
            # Accumulate -min into maxv (alpha=-1, beta=1) => max - min.
            return cutensor._try_reduction_routine(
                self, axis, None, maxv, keepdims, cuda_cutensor.OP_MIN, -1, 1)
    result = _amax(self, axis=axis, out=out, keepdims=keepdims)
    result -= _amin(self, axis=axis, out=None, keepdims=keepdims)
    return result
# TODO(leofang): this signature is incompatible with NumPy!
cdef _ndarray_base _ndarray_argmax(
        _ndarray_base self, axis, out, dtype, keepdims):
    """Argmax reduction; tries the CUB accelerator before the generic
    ``_argmax`` kernel.
    """
    arr = self
    for acc in _accelerator._routine_accelerators:
        if acc != _accelerator.ACCELERATOR_CUB:
            continue
        if arr._f_contiguous and arr.dtype == numpy.bool_:
            # temporary workaround casting the inputs to int8:
            # CUB argmax seems to return different values to NumPy
            # for F-order bool array inputs.
            arr = arr.astype(numpy.int8)
        # result will be None if the reduction is not compatible with CUB.
        result = cub.cub_reduction(
            arr, cub.CUPY_CUB_ARGMAX, axis, dtype, out, keepdims)
        if result is not None:
            return result
    return _argmax(arr, axis=axis, out=out, dtype=dtype, keepdims=keepdims)
# TODO(leofang): this signature is incompatible with NumPy!
cdef _ndarray_base _ndarray_argmin(
        _ndarray_base self, axis, out, dtype, keepdims):
    """Argmin reduction; tries the CUB accelerator before the generic
    ``_argmin`` kernel.
    """
    for acc in _accelerator._routine_accelerators:
        if acc != _accelerator.ACCELERATOR_CUB:
            continue
        # CUB returns None when this reduction is unsupported.
        result = cub.cub_reduction(
            self, cub.CUPY_CUB_ARGMIN, axis, dtype, out, keepdims)
        if result is not None:
            return result
    return _argmin(self, axis=axis, out=out, dtype=dtype, keepdims=keepdims)
cdef _ndarray_base _ndarray_mean(
        _ndarray_base self, axis, dtype, out, keepdims):
    """Mean reduction with accelerator (CUB / cuTENSOR) fast paths.

    ``dtype_sum`` is the accumulator dtype and ``dtype_out`` the dtype of
    the returned array; integer/bool input accumulates in float64.
    """
    cdef Py_ssize_t n
    dtype_sum = dtype_out = dtype
    if dtype is None:
        if self.dtype.kind in 'iub':
            # Integer/bool input: accumulate and return in float64.
            dtype_out = numpy.float64
            dtype_sum = numpy.float64
        elif self.dtype.char == 'e':
            # Half precision: accumulate in float32 for accuracy, then
            # cast the result back down to float16.
            dtype_sum = numpy.float32
            dtype_out = numpy.float16
    elif numpy.dtype(dtype).kind in 'iub':
        # output will be the requested type, but compute the mean using float
        dtype_out = dtype
        dtype_sum = numpy.float64
    for accelerator in _accelerator._routine_accelerators:
        if accelerator == _accelerator.ACCELERATOR_CUB and self.size != 0:
            # CUB computes the sum; divide in place afterwards to get the
            # mean.  None means CUB could not handle this reduction.
            result = cub.cub_reduction(
                self, cub.CUPY_CUB_SUM, axis, dtype_sum, out, keepdims)
            if result is not None:
                n = self.size // result.size
                cupy.true_divide(result, n, out=result, casting='unsafe')
                break
        if (accelerator == _accelerator.ACCELERATOR_CUTENSOR and
                cuda_cutensor is not None):
            from cupyx import cutensor
            # Count the number of reduced elements to fold the 1/n scaling
            # into the cuTENSOR reduction (alpha = 1/n).
            reduce_axis, _ = _reduction._get_axis(axis, self._shape.size())
            n = 1
            for i in reduce_axis:
                n *= self._shape[i]
            n = max(n, 1)
            result = cutensor._try_reduction_routine(
                self, axis, dtype_sum, out, keepdims,
                cuda_cutensor.OP_ADD, 1.0 / n, 0)
            if result is not None:
                break
    else:
        # for/else: no accelerator produced a result, use the generic kernel.
        result = _mean(
            self, axis=axis, dtype=dtype_sum, out=out, keepdims=keepdims)
    if dtype_out is not None and out is None:
        # Cast the accumulator dtype down to the requested output dtype.
        result = result.astype(dtype_out)
    return result
cdef _ndarray_base _ndarray_var(
        _ndarray_base self, axis, dtype, out, ddof, keepdims):
    """Delegate variance computation to the module-level ``_var`` helper."""
    return _var(self, axis, dtype, out, ddof, keepdims)
cdef _ndarray_base _ndarray_std(
        _ndarray_base self, axis, dtype, out, ddof, keepdims):
    """Delegate standard deviation to the module-level ``_std`` helper."""
    return _std(self, axis, dtype, out, ddof, keepdims)
# CUDA preamble shared by the (arg)min/(arg)max reduction kernels below.
# min_max_st carries a value plus its flat index; index == -1 marks an
# "empty" (identity) element, and the *_float comparators propagate NaNs.
cdef _min_max_preamble = '''
template <typename T>
struct min_max_st{
    T value;
    int index;
    __device__ min_max_st() : index(-1) { }
    __device__ min_max_st(T v) : value(v), index(0) { }
    __device__ min_max_st(T v, int i) : value(v), index(i) { }
};

template <typename T>
__device__ min_max_st<T> my_min(
        const min_max_st<T>& a, const min_max_st<T>& b) {
    if (a.index == -1) return b;
    if (b.index == -1) return a;
    return min_max_st<T>(min(a.value, b.value));
}
template <typename T>
__device__ min_max_st<T> my_min_float(
        const min_max_st<T>& a, const min_max_st<T>& b) {
    if (a.index == -1) return b;
    if (b.index == -1) return a;
    if (isnan(a.value)) return a;
    if (isnan(b.value)) return b;
    return min_max_st<T>(min(a.value, b.value));
}

template <typename T>
__device__ min_max_st<T> my_max(
        const min_max_st<T>& a, const min_max_st<T>& b) {
    if (a.index == -1) return b;
    if (b.index == -1) return a;
    return min_max_st<T>(max(a.value, b.value));
}
template <typename T>
__device__ min_max_st<T> my_max_float(
        const min_max_st<T>& a, const min_max_st<T>& b) {
    if (a.index == -1) return b;
    if (b.index == -1) return a;
    if (isnan(a.value)) return a;
    if (isnan(b.value)) return b;
    return min_max_st<T>(max(a.value, b.value));
}

template <typename T>
__device__ min_max_st<T> my_argmin(
        const min_max_st<T>& a, const min_max_st<T>& b) {
    if (a.index == -1) return b;
    if (b.index == -1) return a;
    if (a.value == b.value)
        return min_max_st<T>(a.value, min(a.index, b.index));
    return (a.value <= b.value) ? a : b;
}
template <typename T>
__device__ min_max_st<T> my_argmin_float(
        const min_max_st<T>& a, const min_max_st<T>& b) {
    if (a.index == -1) return b;
    if (b.index == -1) return a;
    if (a.value == b.value)
        return min_max_st<T>(a.value, min(a.index, b.index));
    if (isnan(a.value)) return a;
    if (isnan(b.value)) return b;
    return (a.value <= b.value) ? a : b;
}

template <typename T>
__device__ min_max_st<T> my_argmax(
        const min_max_st<T>& a, const min_max_st<T>& b) {
    if (a.index == -1) return b;
    if (b.index == -1) return a;
    if (a.value == b.value)
        return min_max_st<T>(a.value, min(a.index, b.index));
    return (a.value >= b.value) ? a : b;
}
template <typename T>
__device__ min_max_st<T> my_argmax_float(
        const min_max_st<T>& a, const min_max_st<T>& b) {
    if (a.index == -1) return b;
    if (b.index == -1) return a;
    if (a.value == b.value)
        return min_max_st<T>(a.value, min(a.index, b.index));
    if (isnan(a.value)) return a;
    if (isnan(b.value)) return b;
    return (a.value >= b.value) ? a : b;
}
'''
# Generic min reduction.  Floating/complex dtypes use my_min_float so that
# NaN values propagate to the result, matching NumPy semantics.
cdef _amin = create_reduction_func(
    'cupy_min',
    ('?->?', 'b->b', 'B->B', 'h->h', 'H->H', 'i->i', 'I->I', 'l->l', 'L->L',
     'q->q', 'Q->Q',
     ('e->e', (None, 'my_min_float(a, b)', None, None)),
     ('f->f', (None, 'my_min_float(a, b)', None, None)),
     ('d->d', (None, 'my_min_float(a, b)', None, None)),
     ('F->F', (None, 'my_min_float(a, b)', None, None)),
     ('D->D', (None, 'my_min_float(a, b)', None, None))),
    ('min_max_st<type_in0_raw>(in0)', 'my_min(a, b)', 'out0 = a.value',
     'min_max_st<type_in0_raw>'),
    None, _min_max_preamble)

# Generic max reduction; NaN-propagating comparators for float/complex.
cdef _amax = create_reduction_func(
    'cupy_max',
    ('?->?', 'b->b', 'B->B', 'h->h', 'H->H', 'i->i', 'I->I', 'l->l', 'L->L',
     'q->q', 'Q->Q',
     ('e->e', (None, 'my_max_float(a, b)', None, None)),
     ('f->f', (None, 'my_max_float(a, b)', None, None)),
     ('d->d', (None, 'my_max_float(a, b)', None, None)),
     ('F->F', (None, 'my_max_float(a, b)', None, None)),
     ('D->D', (None, 'my_max_float(a, b)', None, None)),
     ),
    ('min_max_st<type_in0_raw>(in0)', 'my_max(a, b)', 'out0 = a.value',
     'min_max_st<type_in0_raw>'),
    None, _min_max_preamble)

# NaN-ignoring variants: they use the plain comparators, so NaNs are simply
# not selected (min/max skip them).
nanmin = create_reduction_func(
    'cupy_nanmin',
    ('?->?', 'b->b', 'B->B', 'h->h', 'H->H', 'i->i', 'I->I', 'l->l', 'L->L',
     'q->q', 'Q->Q', 'e->e', 'f->f', 'd->d', 'F->F', 'D->D'),
    ('min_max_st<type_in0_raw>(in0)', 'my_min(a, b)', 'out0 = a.value',
     'min_max_st<type_in0_raw>'),
    None, _min_max_preamble)

nanmax = create_reduction_func(
    'cupy_nanmax',
    ('?->?', 'b->b', 'B->B', 'h->h', 'H->H', 'i->i', 'I->I', 'l->l', 'L->L',
     'q->q', 'Q->Q', 'e->e', 'f->f', 'd->d', 'F->F', 'D->D'),
    ('min_max_st<type_in0_raw>(in0)', 'my_max(a, b)', 'out0 = a.value',
     'min_max_st<type_in0_raw>'),
    None, _min_max_preamble)

# Argmin/argmax: _J is the flat input index; ties resolve to the smaller
# index.  sort_reduce_axis=False keeps the original element order so that
# indices stay meaningful.
cdef _argmin = create_reduction_func(
    'cupy_argmin',
    tuple(['{}->{}'.format(d, r) for r in 'qlihb' for d in '?BhHiIlLqQ'])
    + (
        ('e->q', (None, 'my_argmin_float(a, b)', None, None)),
        ('f->q', (None, 'my_argmin_float(a, b)', None, None)),
        ('d->q', (None, 'my_argmin_float(a, b)', None, None)),
        ('F->q', (None, 'my_argmin_float(a, b)', None, None)),
        ('D->q', (None, 'my_argmin_float(a, b)', None, None))),
    ('min_max_st<type_in0_raw>(in0, _J)', 'my_argmin(a, b)', 'out0 = a.index',
     'min_max_st<type_in0_raw>'),
    None, _min_max_preamble, sort_reduce_axis=False)

cdef _argmax = create_reduction_func(
    'cupy_argmax',
    tuple(['{}->{}'.format(d, r) for r in 'qlihb' for d in '?BhHiIlLqQ'])
    + (
        ('e->q', (None, 'my_argmax_float(a, b)', None, None)),
        ('f->q', (None, 'my_argmax_float(a, b)', None, None)),
        ('d->q', (None, 'my_argmax_float(a, b)', None, None)),
        ('F->q', (None, 'my_argmax_float(a, b)', None, None)),
        ('D->q', (None, 'my_argmax_float(a, b)', None, None))),
    ('min_max_st<type_in0_raw>(in0, _J)', 'my_argmax(a, b)', 'out0 = a.index',
     'min_max_st<type_in0_raw>'),
    None, _min_max_preamble, sort_reduce_axis=False)
cpdef _ndarray_base _nanargmax(_ndarray_base a, axis, out, dtype, keepdims):
    """Index of the maximum, ignoring NaNs, via the dedicated kernel."""
    options = dict(axis=axis, out=out, dtype=dtype, keepdims=keepdims)
    return _nanargmax_func(a, **options)
cpdef _ndarray_base _nanargmin(_ndarray_base a, axis, out, dtype, keepdims):
    """Index of the minimum, ignoring NaNs, via the dedicated kernel."""
    options = dict(axis=axis, out=out, dtype=dtype, keepdims=keepdims)
    return _nanargmin_func(a, **options)
# NaN-ignoring argmin/argmax: a NaN element is mapped to index -1, the
# "empty" marker, so the comparators skip it entirely.
cdef _nanargmin_func = create_reduction_func(
    'cupy_nanargmin',
    ('?->q', 'B->q', 'h->q', 'H->q', 'i->q', 'I->q', 'l->q', 'L->q',
     'q->q', 'Q->q',
     ('e->q', (None, 'my_argmin_float(a, b)', None, None)),
     ('f->q', (None, 'my_argmin_float(a, b)', None, None)),
     ('d->q', (None, 'my_argmin_float(a, b)', None, None)),
     ('F->q', (None, 'my_argmin_float(a, b)', None, None)),
     ('D->q', (None, 'my_argmin_float(a, b)', None, None))),
    ('min_max_st<type_in0_raw>(in0, isnan(in0) ? -1 : _J)',
     'my_argmin(a, b)', 'out0 = a.index', 'min_max_st<type_in0_raw>'),
    None, _min_max_preamble, sort_reduce_axis=False)

cdef _nanargmax_func = create_reduction_func(
    'cupy_nanargmax',
    ('?->q', 'B->q', 'h->q', 'H->q', 'i->q', 'I->q', 'l->q', 'L->q',
     'q->q', 'Q->q',
     ('e->q', (None, 'my_argmax_float(a, b)', None, None)),
     ('f->q', (None, 'my_argmax_float(a, b)', None, None)),
     ('d->q', (None, 'my_argmax_float(a, b)', None, None)),
     ('F->q', (None, 'my_argmax_float(a, b)', None, None)),
     ('D->q', (None, 'my_argmax_float(a, b)', None, None))),
    ('min_max_st<type_in0_raw>(in0, isnan(in0) ? -1 : _J)',
     'my_argmax(a, b)', 'out0 = a.index', 'min_max_st<type_in0_raw>'),
    None, _min_max_preamble, sort_reduce_axis=False)

# True along the reduced axis iff at least one element is NaN.
cdef _exists_nan = ReductionKernel(
    'T x', 'bool y', 'isnan(x)', 'a || b', 'y = a', 'false', '_exists_nan')
cpdef _ndarray_base _median(
        _ndarray_base a, axis, out, overwrite_input, keepdims):
    """Median along ``axis`` via a partial sort (``partition``).

    Sequence axes are folded into a single trailing axis first; the median
    is then the mean of the one or two middle partitioned elements.  NaNs
    in float/complex input force a NaN result for that lane.
    """
    keep_ndim = a.ndim

    out_shape = None
    if sequence.PySequence_Check(axis):
        # cupy.sort and cupy.partition only support integer axis, so move
        # all reduced dimensions to the end and reshape them into a single
        # reduction axis.
        reduce_axis, out_axis = _reduction._get_axis(axis, keep_ndim)
        out_shape = _reduction._get_out_shape(a.shape, reduce_axis, out_axis,
                                              keepdims)
        a = a.transpose(out_axis + reduce_axis)
        sort_shape = tuple([a.shape[n] for n in range(len(out_axis))]) + (-1,)
        a = a.reshape(sort_shape)
        if not a.flags.c_contiguous:
            a = cupy.ascontiguousarray(a)
        axis = -1

    if axis is None:
        sz = a.size
    else:
        if axis < -keep_ndim or axis >= keep_ndim:
            raise numpy.AxisError('Axis overrun')
        sz = a.shape[axis]
    # For even length we need the two middle elements; for odd, just one.
    if sz % 2 == 0:
        szh = sz // 2
        kth = [szh - 1, szh]
    else:
        kth = [(sz - 1) // 2]

    if overwrite_input:
        part = a
    else:
        # Partition mutates in place; work on a copy unless allowed.
        part = a.copy()
    if axis is None:
        part = part.ravel()
        part.partition(kth)
    else:
        part.partition(kth, axis=axis)

    if part.shape == ():
        # 0-d result: the median of a scalar is itself.
        return part
    if axis is None:
        axis = 0

    indexer = [slice(None)] * part.ndim
    if keepdims and out_shape is None:
        # Re-insert the dimensions removed by ravel() as length-1 axes.
        _indexer = [None] * (keep_ndim - part.ndim)
        indexer.extend(_indexer)

    index = part.shape[axis] // 2
    if part.shape[axis] % 2 == 1:
        indexer[axis] = slice(index, index+1)
    else:
        indexer[axis] = slice(index-1, index+1)
    indexer = tuple(indexer)

    # Mean of the middle element(s) gives the median.
    out = _mean(
        part[indexer], axis=axis, dtype=None, out=out, keepdims=keepdims)
    if part.dtype.kind in 'fc':
        # Any NaN along the reduced axis makes that median NaN (NumPy rule).
        isnan = _exists_nan(part, axis=axis, keepdims=keepdims)
        out = cupy.where(isnan, numpy.nan, out)
    if out_shape is not None:
        out = out.reshape(out_shape)
    return out
cpdef _ndarray_base _nanmedian(
        _ndarray_base a, axis, out, overwrite_input, keepdims):
    """Median along ``axis`` ignoring NaNs.

    NaNs are replaced with the dtype's max so a full sort pushes them past
    the valid elements; a per-lane counter tracks how many valid elements
    remain, and the median is picked from that reduced length.
    """
    if axis is None:
        axis = tuple(range(a.ndim))
    if not sequence.PySequence_Check(axis):
        axis = (axis,)

    # Split the dimensions into reduced and kept ones (negative axis values
    # are accepted via the ``i - a.ndim in axis`` test).
    reduce_axis = []
    reduce_shape = []
    out_axis = []
    out_shape = []
    for i in range(a.ndim):
        if axis is None or i in axis or i - a.ndim in axis:
            reduce_axis.append(i)
            reduce_shape.append(a.shape[i])
        else:
            out_axis.append(i)
            out_shape.append(a.shape[i])

    a_data_ptr = a.data.ptr
    a = a.transpose(out_axis + reduce_axis)
    a = a.reshape(out_shape + [-1, ])
    a = cupy.ascontiguousarray(a)
    n_reduce = numpy.prod(reduce_shape)
    n_reduce_each = cupy.full(out_shape, n_reduce, dtype='int32')
    if a_data_ptr == a.data.ptr and overwrite_input is False:
        # The transpose/reshape produced a view on the original data; copy
        # before mutating since overwriting the input is not allowed.
        a = a.copy()
    # Push NaNs to the end of each lane and decrement that lane's count.
    _replace_nan_kernel(n_reduce, numpy.finfo(a.dtype).max, a, n_reduce_each)
    a = cupy.sort(a, axis=-1)

    # Lanes that were all-NaN keep their initial NaN value.
    b = cupy.full(out_shape, cupy.nan, dtype=a.dtype)
    _pickup_median_kernel(n_reduce, n_reduce_each, a, b)

    if keepdims:
        # Restore the reduced dimensions as length-1 axes in original order.
        b = b.reshape(out_shape + [1, ] * len(reduce_axis))
        axes = [-1, ] * b.ndim
        for i, j in enumerate(out_axis + reduce_axis):
            axes[j] = i
        b = b.transpose(axes)

    if out is None:
        out = b
    else:
        elementwise_copy(b, out)
    return out
# Replace NaNs (detected via a != a) with ``val`` and atomically decrement
# the valid-element counter of the lane the element belongs to.
cdef _replace_nan_kernel = ElementwiseKernel(
    'I n_reduce, T val', 'T a, raw I n_reduce_each',
    '''
    if (a != a) {
        a = val;
        atomicAdd(&(n_reduce_each[i / n_reduce]), -1);
    }
    ''',
    'cupy_replace_nan'
)

# Pick the median of each sorted lane using its valid-element count; lanes
# with no valid elements (count == 0) are left untouched (stay NaN).
cdef _pickup_median_kernel = ElementwiseKernel(
    'I n_reduce, I n_reduce_each, raw T a', 'T b',
    '''
    if (n_reduce_each > 0) {
        int l = (n_reduce_each - 1) / 2;
        int h = (n_reduce_each    ) / 2;
        if (l == h) {
            b = a[l + n_reduce * i];
        } else {
            b = (a[l + n_reduce * i] + a[h + n_reduce * i])
                / static_cast<T>(2.0);
        }
    }
    ''',
    'cupy_pickup_median'
)
cdef _ndarray_base _mean(
        _ndarray_base a, axis=None, dtype=None, out=None, keepdims=False):
    """Mean reduction; empty input uses a kernel whose identity yields NaN.

    See https://github.com/numpy/numpy/issues/13582 for the empty-input rule.
    """
    kernel = _mean_core_empty if a.size == 0 else _mean_core
    return kernel(a, axis, dtype, out, keepdims)
cdef _ndarray_base _var(
        _ndarray_base a, axis=None, dtype=None, out=None, ddof=0,
        keepdims=False):
    """Variance over ``axis`` with ``ddof`` delta degrees of freedom.

    Integer/bool input computes in float64; complex input yields a real
    result of the matching precision.
    """
    if axis is None:
        axis = tuple(range(a.ndim))
    if not isinstance(axis, tuple):
        axis = (axis,)

    dtype_mean = a.dtype
    dtype_out = numpy.dtype(dtype)
    if dtype is None:
        if a.dtype.kind in 'biu':
            dtype_mean = 'float64'
            dtype_out = 'float64'
        else:
            dtype_mean = a.dtype
            dtype_out = a.dtype
            if a.dtype.kind == 'c':
                # Variance of complex data is real: use the matching
                # real dtype ('F' -> 'f', 'D' -> 'd').
                dtype_out = numpy.dtype(a.dtype.char.lower())

    shape = a.shape
    cdef Py_ssize_t items = 1
    for ax in axis:
        items *= shape[ax]

    # Make alpha NaN when array is empty, mimics NumPy behavior, resulting in
    # NaN. See https://github.com/numpy/numpy/issues/13582 for an explanation
    # on why NaN is the result.
    div = max(items - ddof, 0)
    alpha = 1. / div if div != 0 else nan
    arrmean = a.mean(axis=axis, dtype=dtype_mean, out=None, keepdims=True)
    if out is None:
        # Pick the reduction kernel matching the output precision.
        if dtype_out == 'float16':
            var_core = _var_core_float16
        elif dtype_out == 'float32':
            var_core = _var_core_float32
        else:
            var_core = _var_core_float64
        return var_core(a, arrmean, alpha, axis=axis, keepdims=keepdims)

    out = _var_core_out(a, arrmean, alpha, out, axis=axis, keepdims=keepdims)
    return out.astype(dtype_out, copy=False)
cdef _ndarray_base _std(
        _ndarray_base a, axis=None, dtype=None, out=None, ddof=0,
        keepdims=False):
    """Standard deviation: the square root of ``_var`` with same arguments."""
    variance = _var(
        a, axis=axis, dtype=dtype, out=None, ddof=ddof, keepdims=keepdims)
    return _math._sqrt(variance, dtype=dtype, out=out)
# my_norm(x) = |x|^2 for both real and complex inputs; used by the variance
# kernels below so the same map expression works for complex dtypes.
cdef _norm_preamble = '''
template <typename T> __device__ T my_norm(T x) { return x * x; }
__device__ float my_norm(const complex<float>& x) { return norm(x); }
__device__ double my_norm(const complex<double>& x) { return norm(x); }
'''

# Variance kernels: sum of squared deviations from ``mean``, scaled by
# ``alpha`` (1 / (N - ddof)); one kernel per output precision.
cdef _var_core_float16 = ReductionKernel(
    'S x, T mean, float32 alpha', 'float16 out',
    'my_norm(x - mean)',
    'a + b', 'out = alpha * a', '0', 'cupy_var_core_float16',
    preamble=_norm_preamble)

cdef _var_core_float32 = ReductionKernel(
    'S x, T mean, float32 alpha', 'float32 out',
    'my_norm(x - mean)',
    'a + b', 'out = alpha * a', '0', 'cupy_var_core_float32',
    preamble=_norm_preamble)

cdef _var_core_float64 = ReductionKernel(
    'S x, T mean, float64 alpha', 'float64 out',
    'my_norm(x - mean)',
    'a + b', 'out = alpha * a', '0', 'cupy_var_core_float64',
    preamble=_norm_preamble)

# Variant used when the caller supplies an ``out`` array of arbitrary dtype.
cdef _var_core_out = ReductionKernel(
    'S x, T mean, U alpha', 'U out',
    'my_norm(x - mean)',
    'a + b', 'out = alpha * a', '0', 'cupy_var_core_out',
    preamble=_norm_preamble)
# TODO(okuta) needs cast
# Mean reduction: sum, then divide by the number of reduced elements
# (the in/out index-space size ratio).
cdef _mean_core = create_reduction_func(
    'cupy_mean',
    ('?->d', 'B->d', 'h->d', 'H->d', 'i->d', 'I->d', 'l->d', 'L->d',
     'q->d', 'Q->d',
     ('e->e', (None, None, None, 'float')),
     'f->f', 'd->d', 'F->F', 'D->D'),
    ('in0', 'a + b',
     'out0 = a / _type_reduce(_in_ind.size() / _out_ind.size())', None))

# Same as _mean_core but with identity 0, used for empty input so the
# division produces NaN instead of raising.
cdef _mean_core_empty = create_reduction_func(
    'cupy_mean_empty',
    ('?->d', 'B->d', 'h->d', 'H->d', 'i->d', 'I->d', 'l->d', 'L->d',
     'q->d', 'Q->d',
     ('e->e', (None, None, None, 'float')),
     'f->f', 'd->d', 'F->F', 'D->D'),
    ('in0', 'a + b',
     'out0 = a / _type_reduce(_in_ind.size() / _out_ind.size())', None), 0)

# nanmean accumulator: carries the running sum of non-NaN values plus a
# count of how many contributed, so the final division ignores NaNs.
cdef _nanmean_preamble = '''
template <typename T>
struct nanmean_st{
    typedef long long ll;
    T value;
    ll count;
    __device__ nanmean_st() : value(0), count(0) { }
    __device__ nanmean_st(T v) :
        value(isnan(v) ? T(0) : v), count(isnan(v) ? 0 : 1) { }
    __device__ nanmean_st(T v, ll c) : value(v), count(c) { }
};

template <typename T>
__device__ nanmean_st<T> my_nanmean(
        const nanmean_st<T>& a, const nanmean_st<T>& b) {
    return nanmean_st<T>(a.value + b.value, a.count + b.count);
}
'''

cdef _nanmean_func = create_reduction_func(
    'cupy_nanmean',
    ('e->e', 'f->f', 'd->d', 'F->F', 'D->D'),
    ('in0', 'my_nanmean(a, b)',
     'out0 = a.value / type_out0_raw(a.count)', 'nanmean_st<type_out0_raw>'),
    None, _nanmean_preamble)

# Number of non-NaN elements along the reduced axis (used by nanvar).
_count_non_nan = create_reduction_func(
    'cupy_count_non_nan',
    ('e->q', 'f->q', 'd->q', 'F->q', 'D->q'),
    ('isnan(in0) ? 0 : 1', 'a + b', 'out0 = a', None), 0)
cpdef _ndarray_base _nanmean(_ndarray_base a, axis, dtype, out, keepdims):
    """Mean ignoring NaNs, via the dedicated reduction kernel."""
    options = dict(axis=axis, dtype=dtype, out=out, keepdims=keepdims)
    return _nanmean_func(a, **options)
cpdef _ndarray_base _nanstd(_ndarray_base a, axis, dtype, out, ddof, keepdims):
    """Standard deviation ignoring NaNs: square root of ``_nanvar``."""
    variance = _nanvar(a, axis, dtype, None, ddof, keepdims)
    return _math._sqrt(variance, dtype=dtype, out=out)
cpdef _ndarray_base _nanvar(_ndarray_base a, axis, dtype, out, ddof, keepdims):
    """Variance ignoring NaNs, with ``ddof`` delta degrees of freedom."""
    _count = _count_non_nan(a, axis=axis, keepdims=True)
    arrsum = _math._nansum(a, axis=axis, dtype=dtype, out=None, keepdims=True)
    if out is not None:
        _nanvar_core_out(
            a, arrsum, _count, ddof, out, axis=axis, keepdims=keepdims)
        return out
    # Pick the kernel matching the (possibly complex) input/output dtype.
    if cupy.complex64 in (a.dtype, dtype):
        kernel = _nanvar_core_complex64
    elif cupy.complex128 in (a.dtype, dtype):
        kernel = _nanvar_core_complex128
    else:
        kernel = _nanvar_core
    return kernel(a, arrsum, _count, ddof, axis=axis, keepdims=keepdims)
# Per-element nanvar term: squared deviation from the (nan-aware) mean,
# pre-divided by max(count - ddof, 0); NaN elements contribute 0.
cdef _nanvar_preamble = '''
template <typename S, typename T>
__device__ T nanvar_impl(S x, T mean, long long alpha) {
    return (isnan(x) ? T(0) : T((x - mean) * (x - mean))) / alpha;
}

template <typename S, typename T>
__device__ T nanvar_impl(complex<S> x, complex<T> mean, long long alpha) {
    return (isnan(x) ? T(0) : T(norm(x - mean))) / alpha;
}
'''

cdef _nanvar_core = ReductionKernel(
    'S x, T sum, int64 _count, int64 ddof', 'S out',
    'nanvar_impl(x, sum / _count, max(_count - ddof, 0LL))',
    'a + b', 'out = a', '0', '_nanvar_core', preamble=_nanvar_preamble)

# Complex input variants: variance of complex data is real-valued.
cdef _nanvar_core_complex64 = ReductionKernel(
    'complex64 x, complex64 sum, int64 _count, int64 ddof', 'float32 out',
    'nanvar_impl(x, sum/static_cast<float>(_count), max(_count-ddof, 0LL))',
    'a + b', 'out = a', '0', '_nanvar_core_complex64',
    preamble=_nanvar_preamble)

cdef _nanvar_core_complex128 = ReductionKernel(
    'complex128 x, complex128 sum, int64 _count, int64 ddof', 'float64 out',
    'nanvar_impl(x, sum/static_cast<double>(_count), max(_count-ddof, 0LL))',
    'a + b', 'out = a', '0', '_nanvar_core_complex128',
    preamble=_nanvar_preamble)

# Variant used when the caller supplies an ``out`` array.
cdef _nanvar_core_out = ReductionKernel(
    'S x, T sum, int64 _count, int64 ddof', 'U out',
    'nanvar_impl(x, sum / static_cast<T>(_count), max(_count - ddof, 0LL))',
    'a + b', 'out = a', '0', '_nanvar_core', preamble=_nanvar_preamble)
# Variables to expose to Python
# (cythonized data cannot be exposed to Python, even with cpdef.)
amax = _amax
amin = _amin
cimport cython # NOQA
from libc.stdint cimport int8_t
from libc.stdint cimport int32_t
from cupy.cuda.function cimport CPointer
@cython.final
cdef class CScalar(CPointer):
    # Scalar kernel argument wrapper; passed to CUDA kernels via the
    # inherited CPointer interface.
    cdef:
        char kind    # dtype kind character, presumably as in numpy
                     # ('b'/'i'/'u'/'f'/'c') — TODO confirm against .pyx
        int8_t size  # itemsize of the held scalar in bytes

    @staticmethod
    cdef CScalar from_int32(int32_t value)

    @staticmethod
    cdef CScalar from_numpy_scalar_with_dtype(object x, object dtype)

    @staticmethod
    cdef CScalar _from_python_scalar(object x)

    @staticmethod
    cdef CScalar _from_numpy_scalar(object x)

    # Re-interpret / convert the held value to ``dtype``.
    cpdef apply_dtype(self, dtype)
    # Return the numpy scalar type corresponding to the held value.
    cpdef get_numpy_type(self)

# Free functions of the scalar module (implemented in the .pyx file).
cpdef str get_typename(dtype)
cdef set scalar_type_set
cdef CScalar scalar_to_c_scalar(object x)
cdef object scalar_to_numpy_scalar(object x)
cpdef str _get_cuda_scalar_repr(obj, dtype)
from cpython cimport mem
from libc.stdint cimport int8_t
from libc.stdint cimport int16_t
from libc.stdint cimport int32_t
from libc.stdint cimport int64_t
from libc.stdint cimport uint8_t
from libc.stdint cimport uint16_t
from libc.stdint cimport uint32_t
from libc.stdint cimport uint64_t
import numpy
from cupy._core cimport _dtype
from cupy._core import _dtype as _dtype_module
from cupy._core cimport internal
cdef union Scalar:
    # C-level storage for a scalar of any supported dtype; only one member
    # is meaningful at a time (a union shares storage).
    bint bool_
    int8_t int8_
    int16_t int16_
    int32_t int32_
    int64_t int64_
    uint8_t uint8_
    uint16_t uint16_
    uint32_t uint32_
    uint64_t uint64_
    float float32_
    double float64_
# NumPy dtype -> CUDA C/C++ type name, used when rendering kernel source.
cdef dict _typenames_base = {
    numpy.dtype('float64'): 'double',
    numpy.dtype('float32'): 'float',
    numpy.dtype('float16'): 'float16',
    numpy.dtype('complex128'): 'complex<double>',
    numpy.dtype('complex64'): 'complex<float>',
    numpy.dtype('int64'): 'long long',
    numpy.dtype('int32'): 'int',
    numpy.dtype('int16'): 'short',
    numpy.dtype('int8'): 'signed char',
    numpy.dtype('uint64'): 'unsigned long long',
    numpy.dtype('uint32'): 'unsigned int',
    numpy.dtype('uint16'): 'unsigned short',
    numpy.dtype('uint8'): 'unsigned char',
    numpy.dtype('bool'): 'bool',
}
# NumPy scalar constructors cached at module load to avoid repeated
# attribute lookups on the numpy module in hot paths.
cdef object _numpy_bool_ = numpy.bool_
cdef object _numpy_int8 = numpy.int8
cdef object _numpy_int16 = numpy.int16
cdef object _numpy_int32 = numpy.int32
cdef object _numpy_int64 = numpy.int64
cdef object _numpy_uint8 = numpy.uint8
cdef object _numpy_uint16 = numpy.uint16
cdef object _numpy_uint32 = numpy.uint32
cdef object _numpy_uint64 = numpy.uint64
cdef object _numpy_float16 = numpy.float16
cdef object _numpy_float32 = numpy.float32
cdef object _numpy_float64 = numpy.float64
cdef object _numpy_complex64 = numpy.complex64
cdef object _numpy_complex128 = numpy.complex128
# NOTE(review): numpy.float_ / numpy.complex_ are deprecated aliases that
# were removed in NumPy 2.0 -- this module apparently targets NumPy 1.x;
# verify before upgrading the NumPy dependency.
cdef object _numpy_float_ = numpy.float_
cdef object _numpy_complex_ = numpy.complex_
cpdef str get_typename(dtype):
    """Return the CUDA C/C++ type name registered for ``dtype``."""
    if dtype is None:
        raise ValueError('dtype is None')
    # Fast path: the key is already registered (a NumPy scalar type or a
    # CUDA-specific name such as 'cudaTextureObject_t').
    try:
        return _typenames[dtype]
    except KeyError:
        pass
    # Normalize an arbitrary dtype-like object to its scalar type and retry.
    return _typenames[_dtype.get_dtype(dtype).type]
# scalar type -> CUDA type name; populated by _setup_type_dict().
cdef dict _typenames = {}
# scalar type -> (kind char, itemsize); populated by _setup_type_dict().
cdef dict _dtype_kind_size_dict = {}


cdef _setup_type_dict():
    # Fill the two lookup tables above from the supported dtype characters,
    # then register CUDA-specific opaque type names.
    cdef char k
    for i in _dtype_module.all_type_chars:
        d = numpy.dtype(i)
        t = d.type
        _typenames[t] = _typenames_base[d]
        k = ord(d.kind)
        _dtype_kind_size_dict[t] = (k, d.itemsize)
    # CUDA types
    for t in ('cudaTextureObject_t',):
        _typenames[t] = t
_setup_type_dict()

# Scalar types accepted as kernel arguments.
cdef set _python_scalar_type_set = {int, float, bool, complex}
cdef set _numpy_scalar_type_set = set(_typenames.keys())
cdef set scalar_type_set = _python_scalar_type_set | _numpy_scalar_type_set

# Range of the platform's native integer type: numpy.iinfo(int) is int64 on
# most platforms but int32 on Windows (see _python_scalar_to_numpy_scalar).
_int_iinfo = numpy.iinfo(int)
cdef _int_min = _int_iinfo.min
cdef _int_max = _int_iinfo.max
cdef _int_type = _int_iinfo.dtype.type
cdef bint _use_int32 = _int_type != _numpy_int64
del _int_iinfo
cpdef _python_scalar_to_numpy_scalar(x):
    """Convert a Python bool/int/float/complex to the matching NumPy scalar."""
    # bool must be tested by exact type: isinstance(x, int) also matches it.
    typ = type(x)
    if typ is bool:
        return _numpy_bool_(x)
    if typ is float:
        return _numpy_float_(x)
    if typ is complex:
        return _numpy_complex_(x)
    # Remaining case: a Python int.
    if x >= 0x8000000000000000:
        # Does not fit in int64; only uint64 can hold it.
        return _numpy_uint64(x)
    if _use_int32 and not (_int_min <= x <= _int_max):
        return _numpy_int64(x)
    # Generally `_int_type` is `numpy.int64`.
    # On Windows, it is `numpy.int32`.
    return _int_type(x)
cdef class CScalar(CPointer):
    # Host-side storage for one scalar kernel argument.  `self.ptr`
    # (inherited from CPointer) points at a heap buffer big enough for any
    # supported scalar, including a double complex.

    ndim = 0  # scalars are 0-dimensional, mirroring the ndarray interface

    def __cinit__(self):
        # sizeof(Scalar) does not cover double complex, hence the max().
        self.ptr = mem.PyMem_Malloc(
            max(sizeof(Scalar), sizeof(double complex)))
        self.kind = 0
        self.size = -1

    def __dealloc__(self):
        mem.PyMem_Free(self.ptr)
        self.ptr = <void*>0

    @staticmethod
    cdef CScalar from_int32(int32_t value):
        # Fast constructor for a 32-bit signed integer argument.
        cdef CScalar s = CScalar.__new__(CScalar)
        (<int32_t *>s.ptr)[0] = value
        s.kind = b'i'
        s.size = 4
        return s

    @staticmethod
    cdef CScalar from_numpy_scalar_with_dtype(object x, object dtype):
        # Convert a NumPy scalar, then immediately narrow it to `dtype`.
        cdef CScalar ret = CScalar._from_numpy_scalar(x)
        ret.apply_dtype(dtype)
        return ret

    @staticmethod
    cdef CScalar _from_python_scalar(object x):
        # Store a Python scalar in its widest representation within its kind
        # (int64/uint64/float64/complex128); apply_dtype() narrows later.
        cdef CScalar ret = CScalar.__new__(CScalar)
        cdef Scalar* s = <Scalar*>ret.ptr
        typ = type(x)
        if typ is bool:
            s.bool_ = x
            ret.kind = b'b'
            ret.size = 1
        elif typ is float:
            s.float64_ = x
            ret.kind = b'f'
            ret.size = 8
        elif typ is complex:
            (<double complex*>ret.ptr)[0] = x
            ret.kind = b'c'
            ret.size = 16
        else:
            # Python int: values >= 2**63 only fit in uint64.
            if 0x8000000000000000 <= x:
                s.uint64_ = x
                ret.kind = b'u'
            else:
                s.int64_ = x
                ret.kind = b'i'
            ret.size = 8
        return ret

    @staticmethod
    cdef CScalar _from_numpy_scalar(object x):
        # Store a NumPy scalar, widened to 64 bits within its own kind.
        cdef CScalar ret = CScalar.__new__(CScalar)
        cdef Scalar* s = <Scalar*>ret.ptr
        ret.kind = ord(x.dtype.kind)
        if ret.kind == b'i':
            s.int64_ = x
            ret.size = 8
        elif ret.kind == b'u':
            s.uint64_ = x
            ret.size = 8
        elif ret.kind == b'f':
            s.float64_ = x
            ret.size = 8
        elif ret.kind == b'b':
            s.bool_ = x
            ret.size = 1
        elif ret.kind == b'c':
            (<double complex*>ret.ptr)[0] = x
            ret.size = 16
        else:
            assert False
        return ret

    cpdef apply_dtype(self, dtype):
        """Re-store the held value in-place as ``dtype``.

        Reads the value back in its current representation, then writes it
        again in the target kind/size and updates ``self.kind``/``self.size``.
        """
        cdef Scalar* s = <Scalar*>self.ptr
        # Read the current value out of the union.
        if self.kind == b'b':
            val = s.bool_
            assert self.size == 1
        elif self.kind == b'c':
            assert self.size == 16
            val = (<double complex*>self.ptr)[0]
        else:
            assert self.size == 8
            if self.kind == b'i':
                val = s.int64_
            elif self.kind == b'u':
                val = s.uint64_
            elif self.kind == b'f':
                val = s.float64_
            else:
                assert False
        cdef char kind
        cdef int size
        kind, size = <tuple>_dtype_kind_size_dict[dtype]
        cdef int64_t val_i
        cdef uint64_t val_u
        # Write it back in the target representation.
        if kind == b'b':
            s.bool_ = val
            assert size == 1
        elif kind == b'i':
            if self.kind == b'u':
                # avoid overflow exception
                val_i = s.uint64_
            else:
                val_i = val
            if size == 1:
                s.int8_ = val_i
            elif size == 2:
                s.int16_ = val_i
            elif size == 4:
                s.int32_ = val_i
            elif size == 8:
                s.int64_ = val_i
            else:
                assert False
        elif kind == b'u':
            if self.kind == b'i':
                # avoid overflow exception
                val_u = s.int64_
            else:
                val_u = val
            if size == 1:
                s.uint8_ = val_u
            elif size == 2:
                s.uint16_ = val_u
            elif size == 4:
                s.uint32_ = val_u
            elif size == 8:
                s.uint64_ = val_u
            else:
                assert False
        elif kind == b'f':
            if size == 2:
                # float16 has no C scalar type; store its bit pattern.
                s.uint16_ = internal.to_float16(<float>val)
            elif size == 4:
                s.float32_ = val
            elif size == 8:
                s.float64_ = val
            else:
                assert False
        elif kind == b'c':
            if size == 8:
                (<float complex*>self.ptr)[0] = val
            elif size == 16:
                (<double complex*>self.ptr)[0] = val
            else:
                assert False
        else:
            assert False
        self.kind = kind
        self.size = size

    cpdef get_numpy_type(self):
        """Return the NumPy scalar type matching the held kind/size."""
        if self.kind == b'b':
            return _numpy_bool_
        elif self.kind == b'i':
            if self.size == 1:
                return _numpy_int8
            elif self.size == 2:
                return _numpy_int16
            elif self.size == 4:
                return _numpy_int32
            elif self.size == 8:
                return _numpy_int64
        elif self.kind == b'u':
            if self.size == 1:
                return _numpy_uint8
            elif self.size == 2:
                return _numpy_uint16
            elif self.size == 4:
                return _numpy_uint32
            elif self.size == 8:
                return _numpy_uint64
        elif self.kind == b'f':
            if self.size == 2:
                return _numpy_float16
            elif self.size == 4:
                return _numpy_float32
            elif self.size == 8:
                return _numpy_float64
        elif self.kind == b'c':
            if self.size == 8:
                return _numpy_complex64
            elif self.size == 16:
                return _numpy_complex128
        assert False
cdef CScalar scalar_to_c_scalar(object x):
    # Converts a Python or NumPy scalar to a CScalar.
    # Returns None if the argument is not a scalar.
    cls = type(x)
    if cls in _python_scalar_type_set:
        return CScalar._from_python_scalar(x)
    if cls in _numpy_scalar_type_set:
        return CScalar._from_numpy_scalar(x)
    return None
cdef object scalar_to_numpy_scalar(object x):
    # Converts a Python or NumPy scalar to a NumPy scalar.
    # Returns None if the argument is not a scalar.
    # (The two type sets are disjoint, so the check order is irrelevant.)
    cls = type(x)
    if cls in _numpy_scalar_type_set:
        return x
    if cls in _python_scalar_type_set:
        return _python_scalar_to_numpy_scalar(x)
    return None
cpdef str _get_cuda_scalar_repr(obj, dtype):
    """Render scalar ``obj`` of ``dtype`` as a CUDA C source literal."""
    kind = dtype.kind
    itemsize = dtype.itemsize
    if kind == 'b':
        return str(bool(obj)).lower()
    if kind == 'i':
        # 64-bit integer literals need the 'll' suffix in CUDA C.
        return str(int(obj)) + ('' if itemsize < 8 else 'll')
    if kind == 'u':
        return str(int(obj)) + ('u' if itemsize < 8 else 'ull')
    if kind == 'f':
        if itemsize < 8:
            # Single precision: NaN/Inf have dedicated CUDA macros.
            if numpy.isnan(obj):
                return 'CUDART_NAN_F'
            if numpy.isinf(obj):
                return 'CUDART_INF_F' if obj > 0 else '-CUDART_INF_F'
            return str(float(obj)) + 'f'
        if numpy.isnan(obj):
            return 'CUDART_NAN'
        if numpy.isinf(obj):
            return 'CUDART_INF' if obj > 0 else '-CUDART_INF'
        return str(float(obj))
    if kind == 'c':
        if dtype.itemsize == 8:
            return f'thrust::complex<float>({obj.real}, {obj.imag})'
        elif dtype.itemsize == 16:
            return f'thrust::complex<double>({obj.real}, {obj.imag})'
    raise TypeError(f'Unsupported dtype: {dtype}')
from cupy._core._kernel import create_ufunc

# Elementwise copy with unsafe casting; one specialization per supported
# dtype.  Used as the backend of dtype-converting copies such as astype().
elementwise_copy = create_ufunc(
    'cupy_copy',
    ('?->?', 'b->b', 'B->B', 'h->h', 'H->H', 'i->i', 'I->I', 'l->l', 'L->L',
     'q->q', 'Q->Q', 'e->e', 'f->f', 'd->d', 'F->F', 'D->D'),
    'out0 = in0',
    default_casting='unsafe')
from libcpp cimport vector
from cupy.cuda cimport memory
from cupy.cuda.function cimport CPointer
from cupy.cuda.function cimport Module
from cupy._core._carray cimport shape_t
from cupy._core._carray cimport strides_t
cdef class _ndarray_base:
    cdef:
        object __weakref__
        readonly Py_ssize_t size
        public shape_t _shape
        public strides_t _strides
        readonly bint _c_contiguous
        readonly bint _f_contiguous
        # To do fast indexing in the CArray class
        readonly bint _index_32_bits
        readonly object dtype
        readonly memory.MemoryPointer data
        # TODO(niboshi): Return arbitrary owner object as `base` if the
        # underlying memory is UnownedMemory.
        readonly _ndarray_base base

    # `=*` marks an argument with a default value supplied in the .pyx file.
    cdef _init_fast(self, const shape_t& shape, dtype, bint c_order)
    cpdef item(self)
    cpdef tolist(self)
    cpdef bytes tobytes(self, order=*)
    cpdef tofile(self, fid, sep=*, format=*)
    cpdef dump(self, file)
    cpdef bytes dumps(self)
    cpdef _ndarray_base astype(
        self, dtype, order=*, casting=*, subok=*, copy=*)
    cpdef _ndarray_base copy(self, order=*)
    cpdef _ndarray_base view(self, dtype=*, array_class=*)
    cpdef fill(self, value)
    cpdef _ndarray_base swapaxes(self, Py_ssize_t axis1, Py_ssize_t axis2)
    cpdef _ndarray_base flatten(self, order=*)
    cpdef _ndarray_base ravel(self, order=*)
    cpdef _ndarray_base squeeze(self, axis=*)
    cpdef _ndarray_base take(self, indices, axis=*, out=*)
    cpdef put(self, indices, values, mode=*)
    cpdef repeat(self, repeats, axis=*)
    cpdef choose(self, choices, out=*, mode=*)
    cpdef sort(self, int axis=*)
    cpdef _ndarray_base argsort(self, axis=*)
    cpdef partition(self, kth, int axis=*)
    cpdef _ndarray_base argpartition(self, kth, axis=*)
    cpdef tuple nonzero(self)
    cpdef _ndarray_base compress(self, condition, axis=*, out=*)
    cpdef _ndarray_base diagonal(self, offset=*, axis1=*, axis2=*)
    cpdef _ndarray_base max(self, axis=*, out=*, keepdims=*)
    cpdef _ndarray_base argmax(self, axis=*, out=*, dtype=*, keepdims=*)
    cpdef _ndarray_base min(self, axis=*, out=*, keepdims=*)
    cpdef _ndarray_base argmin(self, axis=*, out=*, dtype=*, keepdims=*)
    cpdef _ndarray_base ptp(self, axis=*, out=*, keepdims=*)
    cpdef _ndarray_base clip(self, min=*, max=*, out=*)
    cpdef _ndarray_base round(self, decimals=*, out=*)
    cpdef _ndarray_base trace(self, offset=*, axis1=*, axis2=*, dtype=*, out=*)
    cpdef _ndarray_base sum(self, axis=*, dtype=*, out=*, keepdims=*)
    cpdef _ndarray_base cumsum(self, axis=*, dtype=*, out=*)
    cpdef _ndarray_base mean(self, axis=*, dtype=*, out=*, keepdims=*)
    cpdef _ndarray_base var(self, axis=*, dtype=*, out=*, ddof=*, keepdims=*)
    cpdef _ndarray_base std(self, axis=*, dtype=*, out=*, ddof=*, keepdims=*)
    cpdef _ndarray_base prod(self, axis=*, dtype=*, out=*, keepdims=*)
    cpdef _ndarray_base cumprod(self, axis=*, dtype=*, out=*)
    cpdef _ndarray_base _add_reduceat(self, indices, axis, dtype, out)
    cpdef _ndarray_base all(self, axis=*, out=*, keepdims=*)
    cpdef _ndarray_base any(self, axis=*, out=*, keepdims=*)
    cpdef _ndarray_base conj(self)
    cpdef _ndarray_base conjugate(self)
    cpdef get(self, stream=*, order=*, out=*)
    cpdef set(self, arr, stream=*)
    cpdef _ndarray_base reduced_view(self, dtype=*)
    cpdef _update_c_contiguity(self)
    cpdef _update_f_contiguity(self)
    cpdef _update_contiguity(self)
    cpdef _set_shape_and_strides(self, const shape_t& shape,
                                 const strides_t& strides,
                                 bint update_c_contiguity,
                                 bint update_f_contiguity)
    cdef _ndarray_base _view(self, subtype, const shape_t& shape,
                             const strides_t& strides,
                             bint update_c_contiguity,
                             bint update_f_contiguity, obj)
    cpdef _set_contiguous_strides(
        self, Py_ssize_t itemsize, bint is_c_contiguous)
    cdef CPointer get_pointer(self)
    cpdef object toDlpack(self)
cpdef _ndarray_base _internal_ascontiguousarray(_ndarray_base a)
cpdef _ndarray_base _internal_asfortranarray(_ndarray_base a)
cpdef _ndarray_base ascontiguousarray(_ndarray_base a, dtype=*)
cpdef _ndarray_base asfortranarray(_ndarray_base a, dtype=*)

# NOTE(review): `cachd_dir` looks like a typo for `cache_dir`, but it is the
# declared keyword name; renaming would have to be coordinated with the
# implementation and all callers.
cpdef Module compile_with_cache(str source, tuple options=*, arch=*,
                                cachd_dir=*, prepend_cupy_headers=*,
                                backend=*, translate_cucomplex=*,
                                enable_cooperative_groups=*,
                                name_expressions=*, log_stream=*,
                                bint jitify=*)

# TODO(niboshi): Move to _routines_creation.pyx
cpdef _ndarray_base array(
    obj, dtype=*, bint copy=*, order=*, bint subok=*, Py_ssize_t ndmin=*)
cpdef _ndarray_base _convert_object_with_cuda_array_interface(a)

cdef _ndarray_base _ndarray_init(subtype, const shape_t& shape, dtype, obj)

cdef _ndarray_base _create_ndarray_from_shape_strides(
    subtype, const shape_t& shape, const strides_t& strides, dtype, obj)
# distutils: language = c++
import contextlib
import functools
import os
import pickle
import re
import warnings
import numpy
import cupy
from cupy._core._kernel import create_ufunc
from cupy._core._kernel import ElementwiseKernel
from cupy._core._ufuncs import elementwise_copy
from cupy._core import flags
from cupy._core import syncdetect
from cupy import cuda
from cupy.cuda import memory as memory_module
from cupy.cuda import stream as stream_mod
from cupy_backends.cuda.api.runtime import CUDARuntimeError
from cupy import _util
cimport cython # NOQA
from libc.stdint cimport int64_t, intptr_t
from cupy._core cimport _carray
from cupy._core cimport _dtype
from cupy._core._dtype cimport get_dtype
from cupy._core._kernel cimport create_ufunc
from cupy._core cimport _routines_binary as _binary
from cupy._core cimport _routines_indexing as _indexing
from cupy._core cimport _routines_linalg as _linalg
from cupy._core cimport _routines_logic as _logic
from cupy._core cimport _routines_manipulation as _manipulation
from cupy._core cimport _routines_math as _math
from cupy._core cimport _routines_sorting as _sorting
from cupy._core cimport _routines_statistics as _statistics
from cupy._core cimport _scalar
from cupy._core cimport dlpack
from cupy._core cimport internal
from cupy.cuda cimport device
from cupy.cuda cimport function
from cupy.cuda cimport pinned_memory
from cupy.cuda cimport memory
from cupy.cuda cimport stream as stream_module
from cupy_backends.cuda cimport stream as _stream_module
from cupy_backends.cuda.api cimport runtime
from cupy_backends.cuda.libs cimport cublas
# If rop of cupy.ndarray is called, cupy's op is the last chance.
# If op of cupy.ndarray is called and the `other` is cupy.ndarray, too,
# it is safe to call cupy's op.
# Otherwise, use this function `_should_use_rop` to choose
# * [True] return NotImplemented to defer rhs, or
# * [False] call NumPy's ufunc to try all `__array_ufunc__`.
# Note that extension types (`cdef class`) in Cython 0.x shares
# implementations of op and rop. (i.e. `__radd__(self, other)` is
# `__add__(other, self)`.)
#
# It follows NEP 13 except that cupy also implements the fallback to
# `__array_priority__`, which seems fair and necessary because of the
# following facts:
# * `numpy` : `scipy.sparse` = `cupy` : `cupyx.scipy.sparse`;
# * NumPy ignores `__array_priority__` attributes of arguments if NumPy finds
# `__array_function__` of `cupy.ndarray`;
# * SciPy sparse classes don't implement `__array_function__` and they even
# don't set `__array_function__ = None` to opt-out the feature; and
# * `__array_priority__` of SciPy sparse classes is respected because
# `numpy.ndarray.__array_function__` does not disable `__array_priority__`.
@cython.profile(False)
cdef inline _should_use_rop(x, y):
    # Decide whether x's op should yield to y's reflected op (see the
    # NEP 13 discussion in the comment block above this function).
    try:
        y_ufunc = y.__array_ufunc__
    except AttributeError:
        pass
    else:
        # Defer only when y explicitly opted out with __array_ufunc__ = None.
        return y_ufunc is None
    # NEP 13's recommendation is `return False`; CuPy additionally falls
    # back to comparing __array_priority__ for scipy.sparse-like classes.
    xp = getattr(x, '__array_priority__', 0)
    yp = getattr(y, '__array_priority__', 0)
    return xp < yp
# Types handled by ndarray binary ops; declared here, presumably assigned
# later in this module (assignment not visible in this chunk).
cdef tuple _HANDLED_TYPES
# Reusable no-op context manager (used when no device switch is needed).
cdef object _null_context = contextlib.nullcontext()
class ndarray(_ndarray_base):
    """
    __init__(self, shape, dtype=float, memptr=None, strides=None, order='C')

    Multi-dimensional array on a CUDA device.

    This class implements a subset of methods of :class:`numpy.ndarray`.
    The difference is that this class allocates the array content on the
    current GPU device.

    Args:
        shape (tuple of ints): Length of axes.
        dtype: Data type. It must be an argument of :class:`numpy.dtype`.
        memptr (cupy.cuda.MemoryPointer): Pointer to the array content head.
        strides (tuple of ints or None): Strides of data in memory.
        order ({'C', 'F'}): Row-major (C-style) or column-major
            (Fortran-style) order.

    Attributes:
        base (None or cupy.ndarray): Base array from which this array is
            created as a view.
        data (cupy.cuda.MemoryPointer): Pointer to the array content head.
        ~ndarray.dtype(numpy.dtype): Dtype object of element type.

            .. seealso::
               `Data type objects (dtype) \
               <https://numpy.org/doc/stable/reference/arrays.dtypes.html>`_
        ~ndarray.size (int): Number of elements this array holds.

            This is equivalent to product over the shape tuple.

            .. seealso:: :attr:`numpy.ndarray.size`
    """

    __module__ = 'cupy'

    def __new__(cls, *args, _obj=None, _no_init=False, **kwargs):
        # `_no_init` skips initialization for internal view/wrap creation.
        x = super().__new__(cls, *args, **kwargs)
        if _no_init:
            return x
        x._init(*args, **kwargs)
        if cls is not ndarray:
            # Mirror NumPy's subclassing hook.
            x.__array_finalize__(_obj)
        return x

    def __init__(self, *args, **kwargs):
        # Prevent from calling the super class `_ndarray_base.__init__()` as
        # it is used to check accidental direct instantiation of underlying
        # `_ndarray_base` extension.
        pass

    def __array_finalize__(self, obj):
        pass

    # We provide the Python-level wrapper of `view` method to follow NumPy's
    # API signature, as it seems that Cython's `cpdef`d methods does not take
    # an argument named `type`. Cython also does not take starargs
    # (`*args` and `**kwargs`) for `cpdef`d methods so we can not interpret
    # the arguments `dtype` and `type` from them.
    def view(self, dtype=None, type=None):
        """Returns a view of the array.

        Args:
            dtype: If this is different from the data type of the array, the
                returned view reinterpret the memory sequence as an array of
                this type.

        Returns:
            cupy.ndarray: A view of the array. A reference to the original
            array is stored at the :attr:`~ndarray.base` attribute.

        .. seealso:: :meth:`numpy.ndarray.view`
        """
        return super(ndarray, self).view(dtype=dtype, array_class=type)
cdef class _ndarray_base:

    def __init__(self, *args, **kwargs):
        # Raise an error if the underlying `_ndarray_base` extension type is
        # directly instantiated. We must instantiate the `ndarray` class
        # instead for our ndarray subclassing mechanism.
        raise RuntimeError('Must not be directly instantiated')
    def _init(self, shape, dtype=float, memptr=None, strides=None,
              order='C'):
        """Set up shape/strides/dtype and allocate (or adopt) device memory.

        Called from ``ndarray.__new__``; not part of the public API.
        """
        cdef Py_ssize_t x, itemsize
        cdef tuple s = internal.get_size(shape)
        del shape

        cdef int order_char = (
            b'C' if order is None else internal._normalize_order(order))

        # `strides` is prioritized over `order`, but invalid `order` should
        # be checked even if `strides` is given.
        if order_char != b'C' and order_char != b'F':
            raise ValueError('order not understood. order=%s' % order)

        # Check for erroneous shape
        if len(s) > _carray.MAX_NDIM:
            msg = 'maximum supported dimension for an ndarray is '
            msg += f'{_carray.MAX_NDIM}, found {len(s)}'
            raise ValueError(msg)
        self._shape.reserve(len(s))
        for x in s:
            if x < 0:
                raise ValueError('Negative dimensions are not allowed')
            self._shape.push_back(x)
        del s

        # dtype
        self.dtype, itemsize = _dtype.get_dtype_with_itemsize(dtype)

        # Store shape and strides
        if strides is not None:
            if memptr is None:
                raise ValueError('memptr is required if strides is given.')
            self._set_shape_and_strides(self._shape, strides, True, True)
        elif order_char == b'C':
            self._set_contiguous_strides(itemsize, True)
        elif order_char == b'F':
            self._set_contiguous_strides(itemsize, False)
        else:
            assert False

        # data
        if memptr is None:
            self.data = memory.alloc(self.size * itemsize)
            # 32-bit indexing is a kernel fast path for small arrays.
            self._index_32_bits = (self.size * itemsize) <= (1 << 31)
        else:
            self.data = memptr
            # For adopted memory, bound the index width by the full extent
            # of the underlying allocation, not just this view.
            bound = cupy._core._memory_range.get_bound(self)
            self._index_32_bits = bound[1] - bound[0] <= (1 << 31)
    cdef _init_fast(self, const shape_t& shape, dtype, bint c_order):
        """ For internal ndarray creation. """
        # Skips the per-element validation done in _init(); callers pass a
        # pre-validated shape_t.
        cdef Py_ssize_t itemsize
        if shape.size() > _carray.MAX_NDIM:
            msg = 'maximum supported dimension for an ndarray is '
            msg += f'{_carray.MAX_NDIM}, found {shape.size()}'
            raise ValueError(msg)
        self._shape = shape
        self.dtype, itemsize = _dtype.get_dtype_with_itemsize(dtype)
        self._set_contiguous_strides(itemsize, c_order)
        self.data = memory.alloc(self.size * itemsize)
        self._index_32_bits = (self.size * itemsize) <= (1 << 31)
    @property
    def __cuda_array_interface__(self):
        """CUDA Array Interface descriptor of this array.

        The exported CAI version is selected by
        ``CUPY_CUDA_ARRAY_INTERFACE_EXPORT_VERSION`` (3, the default, or 2).
        """
        if runtime._is_hip_environment:
            raise AttributeError(
                'HIP/ROCm does not support cuda array interface')
        cdef dict desc = {
            'shape': self.shape,
            'typestr': self.dtype.str,
            'descr': self.dtype.descr,
        }
        cdef int ver = _util.CUDA_ARRAY_INTERFACE_EXPORT_VERSION
        cdef intptr_t stream_ptr
        if ver == 3:
            stream_ptr = stream_module.get_current_stream_ptr()
            # CAI v3 says setting the stream field to 0 is disallowed
            if stream_ptr == 0:
                stream_ptr = _stream_module.get_default_stream_ptr()
            desc['stream'] = stream_ptr
        elif ver == 2:
            # Old behavior (prior to CAI v3): stream sync is explicitly
            # handled by users. To restore this behavior, we do not export
            # any stream if CUPY_CUDA_ARRAY_INTERFACE_EXPORT_VERSION is set
            # to 2 (so that other participating libraries lacking a finer
            # control over sync behavior can avoid syncing).
            pass
        else:
            raise ValueError('CUPY_CUDA_ARRAY_INTERFACE_EXPORT_VERSION can '
                             'only be set to 3 (default) or 2')
        desc['version'] = ver
        if self._c_contiguous:
            # The spec allows strides=None for C-contiguous exports.
            desc['strides'] = None
        else:
            desc['strides'] = self.strides
        if self.size > 0:
            desc['data'] = (self.data.ptr, False)
        else:
            desc['data'] = (0, False)
        return desc
    def __dlpack__(self, stream=None):
        """Export the array as a DLPack capsule, ordering against ``stream``.

        Note: the stream argument is supplied by the consumer, not by CuPy.
        """
        curr_stream = stream_module.get_current_stream()
        curr_stream_ptr = curr_stream.ptr

        # stream must be an int for CUDA/ROCm
        if not runtime._is_hip_environment:  # CUDA
            if stream is None:
                stream = runtime.streamLegacy
            elif not isinstance(stream, int) or stream < -1:
                # DLPack does not accept 0 as a valid stream, but there is a
                # bug in PyTorch that exports the default stream as 0, which
                # renders the protocol unusable, we will accept a 0 value
                # meanwhile.
                raise ValueError(
                    f'On CUDA, the valid stream for the DLPack protocol is '
                    f'-1, 1, 2, or any larger value, but {stream} was '
                    f'provided')
            if stream == 0:
                warnings.warn(
                    'Stream 0 is passed from a library that you are'
                    ' converting to; CuPy assumes 0 as a legacy default '
                    'stream. Please report this problem to the library as '
                    'this violates the DLPack protocol.')
                stream = runtime.streamLegacy
            if curr_stream_ptr == 0:
                curr_stream_ptr = runtime.streamLegacy
        else:  # ROCm/HIP
            if stream is None:
                stream = 0
            elif (not isinstance(stream, int) or stream < -1
                    or stream in (1, 2)):
                raise ValueError(
                    f'On ROCm/HIP, the valid stream for the DLPack protocol '
                    f'is -1, 0, or any value > 2, but {stream} was provided')

        # if -1, no stream order should be established; otherwise, the
        # consumer stream should wait for the work on CuPy's current stream
        # to finish
        if stream >= 0 and stream != curr_stream_ptr:
            next_stream = stream_mod.ExternalStream(stream)
            event = curr_stream.record()
            next_stream.wait_event(event)
        return dlpack.toDlpack(self)
    def __dlpack_device__(self):
        """Return the DLPack ``(device_type, device_id)`` pair."""
        if not runtime._is_hip_environment:
            # Managed (unified) memory gets its own DLPack device type, but
            # only if the targeted DLPack version understands it.
            attrs = runtime.pointerGetAttributes(self.data.ptr)
            is_managed = (
                attrs.type == runtime.memoryTypeManaged
                and _util.DLPACK_EXPORT_VERSION >= (0, 6))
            if is_managed:
                device_type = dlpack.managed_CUDA
            else:
                device_type = dlpack.device_CUDA
        else:
            device_type = dlpack.device_ROCM
        return (device_type, self.device.id)
    # The definition order of attributes and methods are borrowed from the
    # order of documentation at the following NumPy document.
    # https://numpy.org/doc/stable/reference/arrays.ndarray.html

    # -------------------------------------------------------------------------
    # Memory layout
    # -------------------------------------------------------------------------
    @property
    def flags(self):
        """Object containing memory-layout information.

        It only contains ``c_contiguous``, ``f_contiguous``, and ``owndata``
        attributes. All of these are read-only. Accessing by indexes is also
        supported.

        .. seealso:: :attr:`numpy.ndarray.flags`
        """
        # `owndata` is derived from `base`: a view never owns its memory.
        return flags.Flags(self._c_contiguous, self._f_contiguous,
                           self.base is None)
    property shape:
        """Lengths of axes.

        Setter of this property involves reshaping without copy. If the
        array cannot be reshaped without copy, it raises an exception.

        .. seealso: :attr:`numpy.ndarray.shape`
        """

        def __get__(self):
            # `_shape` is a C++ vector; expose it as a Python tuple.
            return tuple(self._shape)

        def __set__(self, newshape):
            _manipulation._ndarray_shape_setter(self, newshape)

    @property
    def strides(self):
        """Strides of axes in bytes.

        .. seealso:: :attr:`numpy.ndarray.strides`
        """
        return tuple(self._strides)

    @property
    def ndim(self):
        """Number of dimensions.

        ``a.ndim`` is equivalent to ``len(a.shape)``.

        .. seealso:: :attr:`numpy.ndarray.ndim`
        """
        return self._shape.size()

    @property
    def itemsize(self):
        """Size of each element in bytes.

        .. seealso:: :attr:`numpy.ndarray.itemsize`
        """
        return self.dtype.itemsize

    @property
    def nbytes(self):
        """Total size of all elements in bytes.

        It does not count skips between elements.

        .. seealso:: :attr:`numpy.ndarray.nbytes`
        """
        return self.size * self.dtype.itemsize
    # -------------------------------------------------------------------------
    # Other attributes
    # -------------------------------------------------------------------------
    @property
    def T(self):
        """Shape-reversed view of the array.

        If ndim < 2, then this is just a reference to the array itself.
        """
        if self.ndim < 2:
            return self
        else:
            return _manipulation._T(self)

    @property
    def flat(self):
        # 1-D iterator over the array (see cupy.flatiter).
        return cupy.flatiter(self)

    # Kept above numpy.ndarray's priority so mixed NumPy/CuPy expressions
    # dispatch to CuPy (see the _should_use_rop discussion above).
    __array_priority__ = 100

    # -------------------------------------------------------------------------
    # Array interface
    # -------------------------------------------------------------------------
    # TODO(beam2d): Implement __array_interface__

    # -------------------------------------------------------------------------
    # foreign function interface
    # -------------------------------------------------------------------------
    @property
    def cstruct(self):
        """C representation of the array.

        This property is used for sending an array to CUDA kernels. The
        type of returned C structure is different for different dtypes and
        ndims. The definition of C type is written in ``cupy/carray.cuh``.
        """
        return _CArray_from_ndarray(self)
    # -------------------------------------------------------------------------
    # Array conversion
    # -------------------------------------------------------------------------
    # These methods transfer data to the host via self.get() and delegate to
    # the NumPy implementation, so each call synchronizes with the device.
    cpdef item(self):
        """Converts the array with one element to a Python scalar

        Returns:
            int or float or complex: The element of the array.

        .. seealso:: :meth:`numpy.ndarray.item`
        """
        if self.size != 1:
            raise ValueError(
                'can only convert an array of size 1 to a Python scalar')
        return self.get().item()

    cpdef tolist(self):
        """Converts the array to a (possibly nested) Python list.

        Returns:
            list: The possibly nested Python list of array elements.

        .. seealso:: :meth:`numpy.ndarray.tolist`
        """
        return self.get().tolist()

    # TODO(okuta): Implement itemset
    # TODO(okuta): Implement tostring

    cpdef bytes tobytes(self, order='C'):
        """Turns the array into a Python bytes object."""
        return self.get().tobytes(order)

    cpdef tofile(self, fid, sep='', format='%s'):
        """Writes the array to a file.

        .. seealso:: :meth:`numpy.ndarray.tofile`
        """
        self.get().tofile(fid, sep, format)

    cpdef dump(self, file):
        """Dumps a pickle of the array to a file.

        Dumped file can be read back to :class:`cupy.ndarray` by
        :func:`cupy.load`.
        """
        # -1 selects the highest available pickle protocol.
        pickle.dump(self, file, -1)

    cpdef bytes dumps(self):
        """Dumps a pickle of the array to a string."""
        return pickle.dumps(self, -1)
    cpdef _ndarray_base astype(
            self, dtype, order='K', casting=None, subok=None, copy=True):
        """Casts the array to given data type.

        Args:
            dtype: Type specifier.
            order ({'C', 'F', 'A', 'K'}): Row-major (C-style) or column-major
                (Fortran-style) order.
                When ``order`` is 'A', it uses 'F' if ``a`` is column-major
                and uses 'C' otherwise.
                And when ``order`` is 'K', it keeps strides as closely as
                possible.
            copy (bool): If it is False and no cast happens, then this method
                returns the array itself. Otherwise, a copy is returned.

        Returns:
            If ``copy`` is False and no cast is required, then the array
            itself is returned. Otherwise, it returns a (possibly casted)
            copy of the array.

        .. note::
           This method currently does not support ``casting``, and ``subok``
           arguments.

        .. seealso:: :meth:`numpy.ndarray.astype`
        """
        cdef strides_t strides

        # TODO(beam2d): Support casting and subok option
        if casting is not None:
            raise TypeError('casting is not supported yet')
        if subok is not None:
            raise TypeError('subok is not supported yet')

        if order is None:
            order = 'K'
        cdef int order_char = internal._normalize_order(order)

        dtype = get_dtype(dtype)
        if dtype == self.dtype:
            # No cast needed: honor copy=False when the requested layout is
            # already satisfied.
            if not copy and (
                    order_char == b'K' or
                    order_char == b'A' and (self._c_contiguous or
                                            self._f_contiguous) or
                    order_char == b'C' and self._c_contiguous or
                    order_char == b'F' and self._f_contiguous):
                return self

        order_char = internal._update_order_char(
            self._c_contiguous, self._f_contiguous, order_char)

        if order_char == b'K':
            # 'K' keeps the relative stride order of the source.
            strides = internal._get_strides_for_order_K(self, dtype)
            newarray = _ndarray_init(ndarray, self._shape, dtype, None)
            # TODO(niboshi): Confirm update_x_contiguity flags
            newarray._set_shape_and_strides(self._shape, strides, True, True)
        else:
            newarray = ndarray(self.shape, dtype=dtype, order=chr(order_char))

        if self.size == 0:
            # skip copy
            if self.dtype.kind == 'c' and newarray.dtype.kind not in 'bc':
                warnings.warn(
                    'Casting complex values to real discards the imaginary '
                    'part',
                    numpy.ComplexWarning)
        else:
            elementwise_copy(self, newarray)
        return newarray

    # TODO(okuta): Implement byteswap
# TODO(okuta): Implement byteswap
cpdef _ndarray_base copy(self, order='C'):
"""Returns a copy of the array.
This method makes a copy of a given array in the current device.
Even when a given array is located in another device, you can copy it
to the current device.
Args:
order ({'C', 'F', 'A', 'K'}): Row-major (C-style) or column-major
(Fortran-style) order.
When ``order`` is 'A', it uses 'F' if ``a`` is column-major and
uses 'C' otherwise.
And when `order` is 'K', it keeps strides as closely as
possible.
.. seealso::
:func:`cupy.copy` for full documentation,
:meth:`numpy.ndarray.copy`
"""
cdef _ndarray_base x
if self.size == 0:
return self.astype(self.dtype, order=order)
dev_id = device.get_device_id()
if self.data.device_id == dev_id:
return self.astype(self.dtype, order=order)
# It need to make a contiguous copy for copying from another device
prev_device = runtime.getDevice()
try:
runtime.setDevice(self.device.id)
x = self.astype(self.dtype, order=order, copy=False)
finally:
runtime.setDevice(prev_device)
newarray = _ndarray_init(ndarray, x._shape, x.dtype, None)
if not x._c_contiguous and not x._f_contiguous:
raise NotImplementedError(
'CuPy cannot copy non-contiguous array between devices.')
# TODO(niboshi): Confirm update_x_contiguity flags
newarray._strides = x._strides
newarray._c_contiguous = x._c_contiguous
newarray._f_contiguous = x._f_contiguous
copy_context = _null_context
if runtime._is_hip_environment:
# HIP requires changing the active device to the one where
# src data is before the copy. From the docs:
# it is recommended to set the current device to the device
# where the src data is physically located.
copy_context = self.device
with copy_context:
newarray.data.copy_from_device_async(x.data, x.nbytes)
return newarray
    cpdef _ndarray_base view(self, dtype=None, array_class=None):
        """Return a view, optionally reinterpreting memory as ``dtype``
        and/or wrapping it in ``array_class`` (an ndarray subclass)."""
        cdef Py_ssize_t ndim, axis, tmp_size
        cdef int self_is, v_is
        if dtype is not None:
            # Mirror NumPy: view(SomeNdarraySubclass) selects the class.
            if type(dtype) is type and issubclass(dtype, ndarray):
                if array_class is not None:
                    raise ValueError('Cannot specify output type twice.')
                array_class = dtype
                dtype = None
        if (
            array_class is not None and (
                type(array_class) is not type or
                not issubclass(array_class, ndarray)
            )
        ):
            raise ValueError('Type must be a sub-type of ndarray type')
        if array_class is None:
            array_class = type(self)

        v = self._view(
            array_class, self._shape, self._strides, False, False, self)
        if dtype is None:
            return v

        v.dtype, v_is = _dtype.get_dtype_with_itemsize(dtype)
        self_is = self.dtype.itemsize
        if v_is == self_is:
            return v

        # Itemsize changed: the last axis is re-scaled in place below.
        ndim = self._shape.size()
        if ndim == 0:
            raise ValueError(
                'Changing the dtype of a 0d array is only supported if '
                'the itemsize is unchanged')
        axis = ndim - 1
        if (
            self._shape[axis] != 1
            and self.size != 0
            and self._strides[axis] != self.dtype.itemsize
        ):
            raise ValueError(
                'To change to a dtype of a different size, the last axis '
                'must be contiguous')

        # Normalize `_strides[axis]` whenever itemsize changes
        v._strides[axis] = v_is
        tmp_size = v._shape[axis] * self_is
        if tmp_size % v_is != 0:
            raise ValueError(
                'When changing to a larger dtype, its size must be a '
                'divisor of the total size in bytes of the last axis '
                'of the array.')
        # itemsize of dtype in CuPy is one of 1, 2, 4, 8, 16.
        # Thus, CuPy does not raise the following:
        #     raise ValueError(
        #         'When changing to a smaller dtype, its size must be a '
        #         'divisor of the size of original dtype')
        v._shape[axis] = tmp_size // v_is
        v.size = v.size * self_is // v_is  # divisible because shape[axis] is.
        if axis != ndim - 1:
            v._update_c_contiguity()
        if axis != 0:
            v._update_f_contiguity()
        return v

    # TODO(okuta): Implement getfield
    # TODO(okuta): Implement setflags
# TODO(okuta): Implement getfield
# TODO(okuta): Implement setflags
cpdef fill(self, value):
"""Fills the array with a scalar value.
Args:
value: A scalar value to fill the array content.
.. seealso:: :meth:`numpy.ndarray.fill`
"""
if isinstance(value, cupy.ndarray):
if value.shape != ():
raise ValueError(
'non-scalar cupy.ndarray cannot be used for fill')
value = value.astype(self.dtype, copy=False)
fill_kernel(value, self)
return
if isinstance(value, numpy.ndarray):
if value.shape != ():
raise ValueError(
'non-scalar numpy.ndarray cannot be used for fill')
value = value.astype(self.dtype, copy=False).item()
if value == 0 and self._c_contiguous:
self.data.memset_async(0, self.nbytes)
else:
fill_kernel(value, self)
# -------------------------------------------------------------------------
# Shape manipulation
# -------------------------------------------------------------------------
def reshape(self, *shape, order='C'):
    """Returns an array of a different shape and the same content.

    .. seealso::
        :func:`cupy.reshape` for full documentation,
        :meth:`numpy.ndarray.reshape`
    """
    return _manipulation._ndarray_reshape(self, shape, order)

# TODO(okuta): Implement resize

def transpose(self, *axes):
    """Returns a view of the array with axes permuted.

    .. seealso::
        :func:`cupy.transpose` for full documentation,
        :meth:`numpy.ndarray.transpose`
    """
    return _manipulation._ndarray_transpose(self, axes)

cpdef _ndarray_base swapaxes(self, Py_ssize_t axis1, Py_ssize_t axis2):
    """Returns a view of the array with two axes swapped.

    .. seealso::
        :func:`cupy.swapaxes` for full documentation,
        :meth:`numpy.ndarray.swapaxes`
    """
    return _manipulation._ndarray_swapaxes(self, axis1, axis2)

cpdef _ndarray_base flatten(self, order='C'):
    """Returns a copy of the array flatten into one dimension.

    Args:
        order ({'C', 'F', 'A', 'K'}):
            'C' means to flatten in row-major (C-style) order.
            'F' means to flatten in column-major (Fortran-
            style) order. 'A' means to flatten in column-major
            order if `self` is Fortran *contiguous* in memory,
            row-major order otherwise. 'K' means to flatten
            `self` in the order the elements occur in memory.
            The default is 'C'.

    Returns:
        cupy.ndarray: A copy of the array with one dimension.

    .. seealso:: :meth:`numpy.ndarray.flatten`
    """
    return _manipulation._ndarray_flatten(self, order)

cpdef _ndarray_base ravel(self, order='C'):
    """Returns an array flattened into one dimension.

    .. seealso::
        :func:`cupy.ravel` for full documentation,
        :meth:`numpy.ndarray.ravel`
    """
    # The result is forced to be C-contiguous before returning.
    return _internal_ascontiguousarray(
        _manipulation._ndarray_ravel(self, order))

cpdef _ndarray_base squeeze(self, axis=None):
    """Returns a view with size-one axes removed.

    .. seealso::
        :func:`cupy.squeeze` for full documentation,
        :meth:`numpy.ndarray.squeeze`
    """
    return _manipulation._ndarray_squeeze(self, axis)
# -------------------------------------------------------------------------
# Item selection and manipulation
# -------------------------------------------------------------------------
cpdef _ndarray_base take(self, indices, axis=None, out=None):
    """Returns an array of elements at given indices along the axis.

    .. seealso::
        :func:`cupy.take` for full documentation,
        :meth:`numpy.ndarray.take`
    """
    return _indexing._ndarray_take(self, indices, axis, out)

cpdef put(self, indices, values, mode='wrap'):
    """Replaces specified elements of an array with given values.

    .. seealso::
        :func:`cupy.put` for full documentation,
        :meth:`numpy.ndarray.put`
    """
    return _indexing._ndarray_put(self, indices, values, mode)

cpdef repeat(self, repeats, axis=None):
    """Returns an array with repeated arrays along an axis.

    .. seealso::
        :func:`cupy.repeat` for full documentation,
        :meth:`numpy.ndarray.repeat`
    """
    return _manipulation._ndarray_repeat(self, repeats, axis)

cpdef choose(self, choices, out=None, mode='raise'):
    """Constructs an array from an index array and a set of choice arrays.

    .. seealso::
        :func:`cupy.choose` for full documentation,
        :meth:`numpy.ndarray.choose`
    """
    return _indexing._ndarray_choose(self, choices, out, mode)

cpdef sort(self, int axis=-1):
    """Sort an array, in-place with a stable sorting algorithm.

    Args:
        axis (int): Axis along which to sort. Default is -1, which means
            sort along the last axis.

    .. note::
        For its implementation reason, ``ndarray.sort`` currently supports
        only arrays with their own data, and does not support ``kind`` and
        ``order`` parameters that ``numpy.ndarray.sort`` does support.

    .. seealso::
        :func:`cupy.sort` for full documentation,
        :meth:`numpy.ndarray.sort`
    """
    # TODO(takagi): Support kind argument.
    _sorting._ndarray_sort(self, axis)

cpdef _ndarray_base argsort(self, axis=-1):
    """Returns the indices that would sort an array with stable sorting

    Args:
        axis (int or None): Axis along which to sort. Default is -1, which
            means sort along the last axis. If None is supplied, the array
            is flattened before sorting.

    Returns:
        cupy.ndarray: Array of indices that sort the array.

    .. seealso::
        :func:`cupy.argsort` for full documentation,
        :meth:`numpy.ndarray.argsort`
    """
    # TODO(takagi): Support kind argument.
    return _sorting._ndarray_argsort(self, axis)

cpdef partition(self, kth, int axis=-1):
    """Partitions an array.

    Args:
        kth (int or sequence of ints): Element index to partition by. If
            supplied with a sequence of k-th it will partition all elements
            indexed by k-th of them into their sorted position at once.
        axis (int): Axis along which to sort. Default is -1, which means
            sort along the last axis.

    .. seealso::
        :func:`cupy.partition` for full documentation,
        :meth:`numpy.ndarray.partition`
    """
    _sorting._ndarray_partition(self, kth, axis)

cpdef _ndarray_base argpartition(self, kth, axis=-1):
    """Returns the indices that would partially sort an array.

    Args:
        kth (int or sequence of ints): Element index to partition by. If
            supplied with a sequence of k-th it will partition all elements
            indexed by k-th of them into their sorted position at once.
        axis (int or None): Axis along which to sort. Default is -1, which
            means sort along the last axis. If None is supplied, the array
            is flattened before sorting.

    Returns:
        cupy.ndarray: Array of the same type and shape as ``a``.

    .. seealso::
        :func:`cupy.argpartition` for full documentation,
        :meth:`numpy.ndarray.argpartition`
    """
    return _sorting._ndarray_argpartition(self, kth, axis)

def searchsorted(self, v, side='left', sorter=None):
    """Finds indices where elements of v should be inserted to maintain order.

    For full documentation, see :func:`cupy.searchsorted`

    .. seealso:: :func:`numpy.searchsorted`
    """ # NOQA
    return cupy.searchsorted(self, v, side, sorter)

cpdef tuple nonzero(self):
    """Return the indices of the elements that are non-zero.

    Returned Array is containing the indices of the non-zero elements
    in that dimension.

    Returns:
        tuple of arrays: Indices of elements that are non-zero.

    .. warning::
        This function may synchronize the device.

    .. seealso::
        :func:`numpy.nonzero`
    """
    return _indexing._ndarray_nonzero(self)

cpdef _ndarray_base compress(self, condition, axis=None, out=None):
    """Returns selected slices of this array along given axis.

    .. warning::
        This function may synchronize the device.

    .. seealso::
        :func:`cupy.compress` for full documentation,
        :meth:`numpy.ndarray.compress`
    """
    return _indexing._ndarray_compress(self, condition, axis, out)

cpdef _ndarray_base diagonal(self, offset=0, axis1=0, axis2=1):
    """Returns a view of the specified diagonals.

    .. seealso::
        :func:`cupy.diagonal` for full documentation,
        :meth:`numpy.ndarray.diagonal`
    """
    return _indexing._ndarray_diagonal(self, offset, axis1, axis2)
# -------------------------------------------------------------------------
# Calculation
# -------------------------------------------------------------------------
cpdef _ndarray_base max(self, axis=None, out=None, keepdims=False):
    """Returns the maximum along a given axis.

    .. seealso::
        :func:`cupy.amax` for full documentation,
        :meth:`numpy.ndarray.max`
    """
    return _statistics._ndarray_max(self, axis, out, None, keepdims)

cpdef _ndarray_base argmax(
        self, axis=None, out=None, dtype=None, keepdims=False):
    """Returns the indices of the maximum along a given axis.

    .. note::
        ``dtype`` and ``keepdim`` arguments are specific to CuPy. They are
        not in NumPy.

    .. note::
        ``axis`` argument accepts a tuple of ints, but this is specific to
        CuPy. NumPy does not support it.

    .. seealso::
        :func:`cupy.argmax` for full documentation,
        :meth:`numpy.ndarray.argmax`
    """
    return _statistics._ndarray_argmax(self, axis, out, dtype, keepdims)

cpdef _ndarray_base min(self, axis=None, out=None, keepdims=False):
    """Returns the minimum along a given axis.

    .. seealso::
        :func:`cupy.amin` for full documentation,
        :meth:`numpy.ndarray.min`
    """
    return _statistics._ndarray_min(self, axis, out, None, keepdims)

cpdef _ndarray_base argmin(
        self, axis=None, out=None, dtype=None, keepdims=False):
    """Returns the indices of the minimum along a given axis.

    .. note::
        ``dtype`` and ``keepdim`` arguments are specific to CuPy. They are
        not in NumPy.

    .. note::
        ``axis`` argument accepts a tuple of ints, but this is specific to
        CuPy. NumPy does not support it.

    .. seealso::
        :func:`cupy.argmin` for full documentation,
        :meth:`numpy.ndarray.argmin`
    """
    return _statistics._ndarray_argmin(self, axis, out, dtype, keepdims)

cpdef _ndarray_base ptp(self, axis=None, out=None, keepdims=False):
    """Returns (maximum - minimum) along a given axis.

    .. seealso::
        :func:`cupy.ptp` for full documentation,
        :meth:`numpy.ndarray.ptp`
    """
    return _statistics._ndarray_ptp(self, axis, out, keepdims)

cpdef _ndarray_base clip(self, min=None, max=None, out=None):
    """Returns an array with values limited to [min, max].

    .. seealso::
        :func:`cupy.clip` for full documentation,
        :meth:`numpy.ndarray.clip`
    """
    return _math._ndarray_clip(self, min, max, out)

cpdef _ndarray_base round(self, decimals=0, out=None):
    """Returns an array with values rounded to the given number of decimals.

    .. seealso::
        :func:`cupy.around` for full documentation,
        :meth:`numpy.ndarray.round`
    """ # NOQA
    return _round_ufunc(self, decimals, out=out)

cpdef _ndarray_base trace(
        self, offset=0, axis1=0, axis2=1, dtype=None, out=None):
    """Returns the sum along diagonals of the array.

    .. seealso::
        :func:`cupy.trace` for full documentation,
        :meth:`numpy.ndarray.trace`
    """
    # Implemented as diagonal extraction followed by a sum over the last
    # axis of the diagonal view.
    d = self.diagonal(offset, axis1, axis2)
    return d.sum(-1, dtype, out, False)

cpdef _ndarray_base sum(
        self, axis=None, dtype=None, out=None, keepdims=False):
    """Returns the sum along a given axis.

    .. seealso::
        :func:`cupy.sum` for full documentation,
        :meth:`numpy.ndarray.sum`
    """
    return _math._ndarray_sum(self, axis, dtype, out, keepdims)

cpdef _ndarray_base cumsum(self, axis=None, dtype=None, out=None):
    """Returns the cumulative sum of an array along a given axis.

    .. seealso::
        :func:`cupy.cumsum` for full documentation,
        :meth:`numpy.ndarray.cumsum`
    """
    return _math._ndarray_cumsum(self, axis, dtype, out)

cpdef _ndarray_base mean(
        self, axis=None, dtype=None, out=None, keepdims=False):
    """Returns the mean along a given axis.

    .. seealso::
        :func:`cupy.mean` for full documentation,
        :meth:`numpy.ndarray.mean`
    """
    return _statistics._ndarray_mean(self, axis, dtype, out, keepdims)

cpdef _ndarray_base var(
        self, axis=None, dtype=None, out=None, ddof=0, keepdims=False):
    """Returns the variance along a given axis.

    .. seealso::
        :func:`cupy.var` for full documentation,
        :meth:`numpy.ndarray.var`
    """
    return _statistics._ndarray_var(
        self, axis, dtype, out, ddof, keepdims)

cpdef _ndarray_base std(
        self, axis=None, dtype=None, out=None, ddof=0, keepdims=False):
    """Returns the standard deviation along a given axis.

    .. seealso::
        :func:`cupy.std` for full documentation,
        :meth:`numpy.ndarray.std`
    """
    return _statistics._ndarray_std(self, axis, dtype, out, ddof, keepdims)

cpdef _ndarray_base prod(
        self, axis=None, dtype=None, out=None, keepdims=None):
    """Returns the product along a given axis.

    .. seealso::
        :func:`cupy.prod` for full documentation,
        :meth:`numpy.ndarray.prod`
    """
    # NOTE(review): ``keepdims`` defaults to None here, unlike ``sum``'s
    # False — confirm the delegate treats None and False identically.
    return _math._ndarray_prod(self, axis, dtype, out, keepdims)

cpdef _ndarray_base cumprod(self, axis=None, dtype=None, out=None):
    """Returns the cumulative product of an array along a given axis.

    .. seealso::
        :func:`cupy.cumprod` for full documentation,
        :meth:`numpy.ndarray.cumprod`
    """
    return _math._ndarray_cumprod(self, axis, dtype, out)

cpdef _ndarray_base _add_reduceat(self, indices, axis, dtype, out):
    # Internal helper backing ``cupy.add.reduceat``; delegates to the
    # indexing module.
    return _indexing._add_reduceat(self, indices, axis, dtype, out)

cpdef _ndarray_base all(self, axis=None, out=None, keepdims=False):
    """Tests whether all array elements along a given axis evaluate to True.

    .. seealso::
        :func:`cupy.all` for full documentation,
        :meth:`numpy.ndarray.all`
    """
    return _logic._ndarray_all(self, axis, out, keepdims)

cpdef _ndarray_base any(self, axis=None, out=None, keepdims=False):
    """Tests whether any array element along a given axis evaluates to True.

    .. seealso::
        :func:`cupy.any` for full documentation,
        :meth:`numpy.ndarray.any`
    """
    return _logic._ndarray_any(self, axis, out, keepdims)
# -------------------------------------------------------------------------
# Arithmetic and comparison operations
# -------------------------------------------------------------------------
# Comparison operators:
def __richcmp__(object self, object other, int op):
    # Cython rich-comparison entry point. ``op`` encodes the operator;
    # the mapping below (0:<, 1:<=, 2:==, 3:!=, 4:>, 5:>=) follows from
    # the comparison functions dispatched for each value.
    if isinstance(other, ndarray):
        # ndarray vs ndarray: dispatch to CuPy's element-wise comparisons.
        if op == 0:
            return _logic._ndarray_less(self, other)
        if op == 1:
            return _logic._ndarray_less_equal(self, other)
        if op == 2:
            return _logic._ndarray_equal(self, other)
        if op == 3:
            return _logic._ndarray_not_equal(self, other)
        if op == 4:
            return _logic._ndarray_greater(self, other)
        if op == 5:
            return _logic._ndarray_greater_equal(self, other)
    elif not _should_use_rop(self, other):
        # Non-ndarray operand that does not take dispatch priority:
        # fall back to the NumPy ufuncs (which re-enter CuPy's protocol).
        if isinstance(other, numpy.ndarray) and other.ndim == 0:
            other = other.item()  # Workaround for numpy<1.13
        if op == 0:
            return numpy.less(self, other)
        if op == 1:
            return numpy.less_equal(self, other)
        if op == 2:
            # cupy.ndarray does not support dtype=object, but
            # allow comparison with None, Ellipsis, and etc.
            if type(other).__eq__ is object.__eq__:
                # Implies `other` is neither (Python/NumPy) scalar nor
                # ndarray. With object's default __eq__, it never
                # equals to an element of cupy.ndarray.
                return cupy.zeros(self._shape, dtype=cupy.bool_)
            return numpy.equal(self, other)
        if op == 3:
            if (
                type(other).__eq__ is object.__eq__
                and type(other).__ne__ is object.__ne__
            ):
                # Similar to eq, but ne falls back to `not __eq__`.
                return cupy.ones(self._shape, dtype=cupy.bool_)
            return numpy.not_equal(self, other)
        if op == 4:
            return numpy.greater(self, other)
        if op == 5:
            return numpy.greater_equal(self, other)
    # Operand with dispatch priority: let it implement the reflected op.
    return NotImplemented
# Truth value of an array (bool):
def __nonzero__(self):
    """Truth value of the array (``bool(a)``).

    A size-1 array converts to the truth value of its single element; an
    empty array warns and evaluates to ``False``; anything larger is
    ambiguous and raises :class:`ValueError`.
    """
    n = self.size
    if n == 1:
        # Exactly one element: its host value decides the truth value.
        return bool(self.get())
    if n == 0:
        warnings.warn(
            'The truth value of an empty array is ambiguous. Returning '
            'False, but in future this will result in an error. Use '
            '`array.size > 0` to check that an array is not empty.',
            DeprecationWarning)
        return False
    raise ValueError(
        'The truth value of an array with more than one element is '
        'ambiguous. Use a.any() or a.all()')
# Unary operations:
def __neg__(self):
    # Element-wise negation.
    return _math._negative(self)

def __pos__(self):
    # Element-wise unary plus; deprecated for bool arrays (returns a copy
    # for now, mirroring NumPy's deprecation).
    if self.dtype == numpy.bool_:
        msg = ("Applying '+' to a non-numerical array is ill-defined. "
               'Returning a copy, but in the future this will error.')
        warnings.warn(msg, DeprecationWarning)
        return self.copy()
    return _math._positive(self)

def __abs__(self):
    # Element-wise absolute value.
    return _math._absolute(self)

def __invert__(self):
    # Element-wise bitwise NOT (logical NOT for bool arrays).
    return _binary._invert(self)
# Arithmetic:
# Binary arithmetic slots. Cython calls these for both normal and
# reflected operands, so ``x`` may be either the ndarray or the scalar.
# Pattern for each: ndarray operand -> CuPy ufunc; operand with dispatch
# priority (per _should_use_rop) -> NotImplemented; otherwise -> the NumPy
# ufunc, which re-enters CuPy via the __array_ufunc__ protocol.

def __add__(x, y):
    if isinstance(y, ndarray):
        return _math._add(x, y)
    elif _should_use_rop(x, y):
        return NotImplemented
    else:
        return numpy.add(x, y)

def __sub__(x, y):
    if isinstance(y, ndarray):
        return _math._subtract(x, y)
    elif _should_use_rop(x, y):
        return NotImplemented
    else:
        return numpy.subtract(x, y)

def __mul__(x, y):
    if isinstance(y, ndarray):
        return _math._multiply(x, y)
    elif _should_use_rop(x, y):
        return NotImplemented
    else:
        return numpy.multiply(x, y)

def __matmul__(x, y):
    if isinstance(y, ndarray):
        return _linalg.matmul(x, y)
    elif _should_use_rop(x, y):
        return NotImplemented
    else:
        return numpy.matmul(x, y)

def __div__(x, y):
    # Python 2 style division slot; kept for Cython compatibility.
    if isinstance(y, ndarray):
        return _math._divide(x, y)
    elif _should_use_rop(x, y):
        return NotImplemented
    else:
        return numpy.divide(x, y)

def __truediv__(x, y):
    if isinstance(y, ndarray):
        return _math._true_divide(x, y)
    elif _should_use_rop(x, y):
        return NotImplemented
    else:
        return numpy.true_divide(x, y)

def __floordiv__(x, y):
    if isinstance(y, ndarray):
        return _math._floor_divide(x, y)
    elif _should_use_rop(x, y):
        return NotImplemented
    else:
        return numpy.floor_divide(x, y)

def __mod__(x, y):
    if isinstance(y, ndarray):
        return _math._remainder(x, y)
    elif _should_use_rop(x, y):
        return NotImplemented
    else:
        return numpy.remainder(x, y)
def __divmod__(x, y):
    """Element-wise ``divmod``; returns the (floor-quotient, remainder) pair.

    Follows the same dispatch pattern as the other binary slots: ndarray
    operand -> CuPy ufunc; priority operand -> NotImplemented; otherwise
    -> the NumPy ufunc.
    """
    if isinstance(y, ndarray):
        # BUG FIX: this branch previously returned the builtin
        # ``divmod(x, y)``, which dispatches straight back into this slot
        # method (both the forward and reflected lookups land here in a
        # Cython cdef class), recursing until RecursionError. Delegate to
        # the ``cupy.divmod`` ufunc instead, mirroring the
        # ``numpy.divmod`` fallback below.
        return cupy.divmod(x, y)
    elif _should_use_rop(x, y):
        return NotImplemented
    else:
        return numpy.divmod(x, y)
def __pow__(x, y, modulo):
    # Note that we ignore the modulo argument as well as NumPy.
    if isinstance(y, ndarray):
        return _math._power(x, y)
    elif _should_use_rop(x, y):
        return NotImplemented
    else:
        return numpy.power(x, y)

# Bitwise/shift slots follow the same three-way dispatch as the
# arithmetic slots above.

def __lshift__(x, y):
    if isinstance(y, ndarray):
        return _binary._left_shift(x, y)
    elif _should_use_rop(x, y):
        return NotImplemented
    else:
        return numpy.left_shift(x, y)

def __rshift__(x, y):
    if isinstance(y, ndarray):
        return _binary._right_shift(x, y)
    elif _should_use_rop(x, y):
        return NotImplemented
    else:
        return numpy.right_shift(x, y)

def __and__(x, y):
    if isinstance(y, ndarray):
        return _binary._bitwise_and(x, y)
    elif _should_use_rop(x, y):
        return NotImplemented
    else:
        return numpy.bitwise_and(x, y)

def __or__(x, y):
    if isinstance(y, ndarray):
        return _binary._bitwise_or(x, y)
    elif _should_use_rop(x, y):
        return NotImplemented
    else:
        return numpy.bitwise_or(x, y)

def __xor__(x, y):
    if isinstance(y, ndarray):
        return _binary._bitwise_xor(x, y)
    elif _should_use_rop(x, y):
        return NotImplemented
    else:
        return numpy.bitwise_xor(x, y)
# Arithmetic, in-place:
# In-place arithmetic: each ufunc is called with ``self`` as the ``out``
# argument, so the result is written into this array's own buffer.

def __iadd__(self, other):
    return _math._add(self, other, self)

def __isub__(self, other):
    return _math._subtract(self, other, self)

def __imul__(self, other):
    return _math._multiply(self, other, self)

def __idiv__(self, other):
    return _math._divide(self, other, self)

def __itruediv__(self, other):
    return _math._true_divide(self, other, self)

def __ifloordiv__(self, other):
    return _math._floor_divide(self, other, self)

def __imod__(self, other):
    return _math._remainder(self, other, self)

def __ipow__(self, other):
    return _math._power(self, other, self)

def __ilshift__(self, other):
    return _binary._left_shift(self, other, self)

def __irshift__(self, other):
    return _binary._right_shift(self, other, self)

def __iand__(self, other):
    return _binary._bitwise_and(self, other, self)

def __ior__(self, other):
    return _binary._bitwise_or(self, other, self)

def __ixor__(self, other):
    return _binary._bitwise_xor(self, other, self)

cpdef _ndarray_base conj(self):
    # Complex conjugate; alias of ``conjugate``.
    return _math._ndarray_conj(self)

cpdef _ndarray_base conjugate(self):
    # Complex conjugate; alias of ``conj``.
    return _math._ndarray_conj(self)

@property
def real(self):
    # Real part of the array (settable).
    return _math._ndarray_real_getter(self)

@real.setter
def real(self, value):
    _math._ndarray_real_setter(self, value)

@property
def imag(self):
    # Imaginary part of the array (settable).
    return _math._ndarray_imag_getter(self)

@imag.setter
def imag(self, value):
    _math._ndarray_imag_setter(self, value)
# -------------------------------------------------------------------------
# Special methods
# -------------------------------------------------------------------------
# For standard library functions:
def __copy__(self):
    # Shallow copy protocol (``copy.copy``): a full device-side copy.
    return self.copy()

def __deepcopy__(self, memo):
    # Make a contiguous copy while the array's own device is current, so
    # ``copy.deepcopy`` works when called from another device context.
    prev_device = runtime.getDevice()
    try:
        runtime.setDevice(self.device.id)
        return self.copy()
    finally:
        runtime.setDevice(prev_device)

def __reduce__(self):
    # Pickle support: serialize via the host copy obtained by ``get()``.
    return array, (self.get(),)

# Basic customization:

# _ndarray_base does not define __new__

def __array__(self, dtype=None):
    # TODO(imanishi): Support an environment variable or a global
    # configure flag that allows implicit conversions to NumPy array.
    # (See https://github.com/cupy/cupy/issues/589 for the detail.)
    raise TypeError(
        'Implicit conversion to a NumPy array is not allowed. '
        'Please use `.get()` to construct a NumPy array explicitly.')

@classmethod
def __class_getitem__(cls, tuple item):
    # PEP 585-style subscription, e.g. ``cupy.ndarray[Any, dtype]``.
    from cupy.typing._generic_alias import GenericAlias
    item1, item2 = item
    return GenericAlias(cupy.ndarray, (item1, item2))

# TODO(okuta): Implement __array_wrap__

# Container customization:

def __iter__(self):
    # Iterates over the first axis, yielding subarray views.
    if self._shape.size() == 0:
        raise TypeError('iteration over a 0-d array')
    return (self[i] for i in range(self._shape[0]))

def __len__(self):
    # Length of the first axis; 0-d arrays have no length.
    if self._shape.size() == 0:
        raise TypeError('len() of unsized object')
    return self._shape[0]
def __getitem__(self, slices):
    """x.__getitem__(y) <==> x[y]

    Supports both basic and advanced indexing.

    .. note::
        Currently, it does not support ``slices`` that consists of more
        than one boolean arrays

    .. note::
        CuPy handles out-of-bounds indices differently from NumPy.
        NumPy handles them by raising an error, but CuPy wraps around them.

    Example:
        >>> a = cupy.arange(3)
        >>> a[[1, 3]]
        array([1, 0])
    """
    return _indexing._ndarray_getitem(self, slices)
def __setitem__(self, slices, value):
    """x.__setitem__(slices, y) <==> x[slices] = y

    Supports both basic and advanced indexing.

    .. note::
        Currently, it does not support ``slices`` that consists of more
        than one boolean arrays

    .. note::
        CuPy handles out-of-bounds indices differently from NumPy when
        using integer array indexing.
        NumPy handles them by raising an error, but CuPy wraps around them.

        >>> import cupy
        >>> x = cupy.arange(3)
        >>> x[[1, 3]] = 10
        >>> x
        array([10, 10, 2])

    .. note::
        The behavior differs from NumPy when integer arrays in ``slices``
        reference the same location multiple times.
        In that case, the value that is actually stored is undefined.

        >>> import cupy
        >>> a = cupy.zeros((2,))
        >>> i = cupy.arange(10000) % 2
        >>> v = cupy.arange(10000).astype(cupy.float_)
        >>> a[i] = v
        >>> a  # doctest: +SKIP
        array([9150., 9151.])

        On the other hand, NumPy stores the value corresponding to the
        last index among the indices referencing duplicate locations.

        >>> import numpy
        >>> a_cpu = numpy.zeros((2,))
        >>> i_cpu = numpy.arange(10000) % 2
        >>> v_cpu = numpy.arange(10000).astype(numpy.float_)
        >>> a_cpu[i_cpu] = v_cpu
        >>> a_cpu
        array([9998., 9999.])
    """
    # Fast path (opt-in via ENABLE_SLICE_COPY): ``a[:] = numpy_array``
    # becomes a direct host-to-device memcpy instead of a scatter.
    if _util.ENABLE_SLICE_COPY and (
        type(slices) is slice
        and slices == slice(None, None, None)
        and isinstance(value, numpy.ndarray)
    ):
        if (self.dtype == value.dtype
                and self.shape == value.shape
                and (self._f_contiguous or self._c_contiguous)):
            # Ravel the host array in this array's memory order so the
            # flat byte layouts match.
            order = 'F' if self._f_contiguous else 'C'
            tmp = value.ravel(order)
            ptr = tmp.ctypes.data
            # Synchronous copy on the null stream, async otherwise.
            stream_ptr = stream_module.get_current_stream_ptr()
            if stream_ptr == 0:
                self.data.copy_from_host(ptr, self.nbytes)
            else:
                self.data.copy_from_host_async(ptr, self.nbytes)
        else:
            raise ValueError(
                'copying a numpy.ndarray to a cupy.ndarray by empty slice '
                'assignment must ensure arrays have same shape and dtype')
    else:
        # General path: basic/advanced-indexing scatter.
        _indexing._ndarray_setitem(self, slices, value)
def scatter_add(self, slices, value):
    """Adds given values to specified elements of an array.

    .. seealso::
        :func:`cupyx.scatter_add` for full documentation.
    """
    warnings.warn(
        '`ndarray.scatter_add` is deprecated. '
        'Please use `cupy.add.at` instead.',
        DeprecationWarning)
    self._scatter_op(slices, value, 'add')
def scatter_max(self, slices, value):
    """Stores a maximum value of elements specified by indices to an array.

    .. seealso::
        :func:`cupyx.scatter_max` for full documentation.
    """
    # FIX: the deprecation message was missing the sentence period after
    # 'deprecated', unlike the scatter_add message; made consistent.
    warnings.warn(
        '`ndarray.scatter_max` is deprecated. '
        'Please use `cupy.maximum.at` instead.',
        DeprecationWarning)
    self._scatter_op(slices, value, 'max')
def scatter_min(self, slices, value):
    """Stores a minimum value of elements specified by indices to an array.

    .. seealso::
        :func:`cupyx.scatter_min` for full documentation.
    """
    # FIX: the deprecation message was missing the sentence period after
    # 'deprecated', unlike the scatter_add message; made consistent.
    warnings.warn(
        '`ndarray.scatter_min` is deprecated. '
        'Please use `cupy.minimum.at` instead.',
        DeprecationWarning)
    self._scatter_op(slices, value, 'min')
def _scatter_op(self, slices, value, op):
    # Internal dispatcher shared by scatter_add/scatter_max/scatter_min
    # and __setitem__; ``op`` selects the scatter reduction.
    _indexing._scatter_op(self, slices, value, op)
# TODO(okuta): Implement __getslice__
# TODO(okuta): Implement __setslice__
# TODO(okuta): Implement __contains__
# numpy/ufunc compat
def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
    """Apply unary or binary ufunc to this array

    If binary, only allow if second argument is another cupy ndarray or
    a number, i.e., raise ValueError instead of silently converting a
    numpy array.
    """
    import cupy  # top-level ufuncs
    import cupyx.scipy.special  # special ufuncs
    inout = inputs
    if 'out' in kwargs:
        # need to unfold tuple argument in kwargs
        # TODO(ecastill) GUFuncs support more than one output
        out = kwargs['out']
        if len(out) != 1:
            raise ValueError('The \'out\' parameter must have exactly one '
                             'array value')
        # ``out`` participates in the type check below alongside inputs.
        inout += out
        kwargs['out'] = out[0]
    if method in (
        '__call__', 'outer', 'at', 'reduce', 'accumulate', 'reduceat'
    ):
        # Resolve the CuPy counterpart of the NumPy ufunc by name, first
        # in cupy's top-level namespace, then in cupyx.scipy.special.
        name = ufunc.__name__
        try:
            func = getattr(cupy, name, None) or getattr(
                cupyx.scipy.special, name
            )
            if method != '__call__':
                func = getattr(func, method)
        except AttributeError:
            return NotImplemented
        for x in inout:
            # numpy.ndarray is handled and then TypeError is raised due to
            # implicit host-to-device conversion.
            # Except for numpy.ndarray, types should be supported by
            # `_kernel._preprocess_args`.
            check = (hasattr(x, '__cuda_array_interface__')
                     or hasattr(x, '__cupy_get_ndarray__'))
            if runtime._is_hip_environment and isinstance(x, ndarray):
                check = True
            if (not check
                    and not type(x) in _scalar.scalar_type_set
                    and not isinstance(x, numpy.ndarray)):
                return NotImplemented
        if name in [
                'greater', 'greater_equal', 'less', 'less_equal',
                'equal', 'not_equal']:
            # workaround for numpy/numpy#12142
            inputs = tuple([
                x.item()
                if isinstance(x, numpy.ndarray) and x.ndim == 0
                else x
                for x in inputs
            ])
        return func(*inputs, **kwargs)
    else:
        return NotImplemented
def __array_function__(self, func, types, args, kwargs):
    # NEP 18 protocol: route ``numpy.foo(...)`` calls on cupy arrays to
    # the CuPy function of the same module path and name.
    try:
        module = functools.reduce(
            getattr, func.__module__.split('.')[1:], cupy)
        cupy_func = getattr(module, func.__name__)
    except AttributeError:
        return NotImplemented
    if cupy_func is func:
        # avoid NumPy func
        return NotImplemented
    # Every participating type must be one CuPy knows how to handle.
    for t in types:
        for handled_type in _HANDLED_TYPES:
            if issubclass(t, handled_type):
                break
        else:
            return NotImplemented
    return cupy_func(*args, **kwargs)
# Conversion:
# Conversion and representation: all of these transfer the array to the
# host via ``get()`` and delegate to the host object's behavior.

def __int__(self):
    return int(self.get())

def __float__(self):
    return float(self.get())

def __complex__(self):
    return complex(self.get())

def __oct__(self):
    return oct(self.get())

def __hex__(self):
    return hex(self.get())

def __bytes__(self):
    return bytes(self.get())

# String representations:

def __repr__(self):
    return repr(self.get())

def __str__(self):
    return str(self.get())

def __format__(self, format_spec):
    return format(self.get(), format_spec)
# -------------------------------------------------------------------------
# Methods outside of the ndarray main documentation
# -------------------------------------------------------------------------
def dot(self, _ndarray_base b, _ndarray_base out=None):
    """Returns the dot product with given array.

    .. seealso::
        :func:`cupy.dot` for full documentation,
        :meth:`numpy.ndarray.dot`
    """
    return _linalg.dot(self, b, out)
# -------------------------------------------------------------------------
# Cupy specific attributes and methods
# -------------------------------------------------------------------------
@property
def device(self):
    """CUDA device on which this array resides."""
    return self.data.device
cpdef get(self, stream=None, order='C', out=None):
    """Returns a copy of the array on host memory.

    Args:
        stream (cupy.cuda.Stream): CUDA stream object. If it is given, the
            copy runs asynchronously. Otherwise, the copy is synchronous.
            The default uses CUDA stream object of the current context.
        order ({'C', 'F', 'A'}): The desired memory layout of the host
            array. When ``order`` is 'A', it uses 'F' if the array is
            fortran-contiguous and 'C' otherwise. The ``order`` will be
            ignored if ``out`` is specified.
        out (numpy.ndarray): Output array. In order to enable asynchronous
            copy, the underlying memory should be a pinned memory.

    Returns:
        numpy.ndarray: Copy of the array on host memory.
    """
    if out is not None:
        # Caller-supplied destination: validate dtype/shape, then make
        # sure the device source matches out's memory layout.
        if not isinstance(out, numpy.ndarray):
            raise TypeError('Only numpy.ndarray can be obtained from'
                            'cupy.ndarray')
        if self.dtype != out.dtype:
            raise TypeError(
                '{} array cannot be obtained from {} array'.format(
                    out.dtype, self.dtype))
        if self.shape != out.shape:
            raise ValueError(
                'Shape mismatch. Expected shape: {}, '
                'actual shape: {}'.format(self.shape, out.shape))
        if not (out.flags.c_contiguous and self._c_contiguous or
                out.flags.f_contiguous and self._f_contiguous):
            # Layouts differ: materialize a device copy in out's order,
            # on this array's own device.
            prev_device = runtime.getDevice()
            try:
                runtime.setDevice(self.device.id)
                if out.flags.c_contiguous:
                    a_gpu = _internal_ascontiguousarray(self)
                elif out.flags.f_contiguous:
                    a_gpu = _internal_asfortranarray(self)
                else:
                    raise RuntimeError(
                        '`out` cannot be specified when copying to '
                        'non-contiguous ndarray')
            finally:
                runtime.setDevice(prev_device)
        else:
            a_gpu = self
        a_cpu = out
    else:
        # Allocate a fresh host array in the requested order.
        if self.size == 0:
            # Nothing to copy for empty arrays.
            return numpy.ndarray(self._shape, dtype=self.dtype)
        order = order.upper()
        if order == 'A':
            if self._f_contiguous:
                order = 'F'
            else:
                order = 'C'
        if not (order == 'C' and self._c_contiguous or
                order == 'F' and self._f_contiguous):
            # Make the source contiguous in the requested order first.
            prev_device = runtime.getDevice()
            try:
                runtime.setDevice(self.device.id)
                if order == 'C':
                    a_gpu = _internal_ascontiguousarray(self)
                elif order == 'F':
                    a_gpu = _internal_asfortranarray(self)
                else:
                    raise ValueError('unsupported order: {}'.format(order))
            finally:
                runtime.setDevice(prev_device)
        else:
            a_gpu = self
        a_cpu = numpy.empty(self._shape, dtype=self.dtype, order=order)
    # Device-to-host transfer; counts as a synchronization point for the
    # sync-detection machinery.
    syncdetect._declare_synchronize()
    ptr = a_cpu.ctypes.data
    prev_device = runtime.getDevice()
    try:
        runtime.setDevice(self.device.id)
        if stream is not None:
            a_gpu.data.copy_to_host_async(ptr, a_gpu.nbytes, stream)
        else:
            # No explicit stream: synchronous on the null stream,
            # asynchronous on the current non-null stream.
            stream_ptr = stream_module.get_current_stream_ptr()
            if stream_ptr == 0:
                a_gpu.data.copy_to_host(ptr, a_gpu.nbytes)
            else:
                a_gpu.data.copy_to_host_async(ptr, a_gpu.nbytes)
    finally:
        runtime.setDevice(prev_device)
    return a_cpu
cpdef set(self, arr, stream=None):
    """Copies an array on the host memory to :class:`cupy.ndarray`.

    Args:
        arr (numpy.ndarray): The source array on the host memory.
        stream (cupy.cuda.Stream): CUDA stream object. If it is given, the
            copy runs asynchronously. Otherwise, the copy is synchronous.
            The default uses CUDA stream object of the current context.
    """
    if not isinstance(arr, numpy.ndarray):
        raise TypeError('Only numpy.ndarray can be set to cupy.ndarray')
    if self.dtype != arr.dtype:
        raise TypeError('{} array cannot be set to {} array'.format(
            arr.dtype, self.dtype))
    if self.shape != arr.shape:
        raise ValueError(
            'Shape mismatch. Old shape: {}, new shape: {}'.format(
                self.shape, arr.shape))
    # Convert the host source to match this array's memory layout so a
    # flat byte copy is valid.
    if self._c_contiguous:
        arr = numpy.ascontiguousarray(arr)
    elif self._f_contiguous:
        arr = numpy.asfortranarray(arr)
    else:
        raise RuntimeError('Cannot set to non-contiguous array')
    ptr = arr.ctypes.data
    prev_device = runtime.getDevice()
    try:
        runtime.setDevice(self.device.id)
        if stream is not None:
            self.data.copy_from_host_async(ptr, self.nbytes, stream)
        else:
            # No explicit stream: synchronous on the null stream,
            # asynchronous on the current non-null stream.
            stream_ptr = stream_module.get_current_stream_ptr()
            if stream_ptr == 0:
                self.data.copy_from_host(ptr, self.nbytes)
            else:
                self.data.copy_from_host_async(ptr, self.nbytes)
    finally:
        runtime.setDevice(prev_device)
cpdef _ndarray_base reduced_view(self, dtype=None):
    """Returns a view of the array with minimum number of dimensions.

    Args:
        dtype: (Deprecated) Data type specifier.
            If it is given, then the memory
            sequence is reinterpreted as the new type.

    Returns:
        cupy.ndarray: A view of the array with reduced dimensions.
    """
    cdef shape_t shape
    cdef strides_t strides
    cdef Py_ssize_t ndim
    cdef _ndarray_base view
    if dtype is not None:
        warnings.warn(
            'calling reduced_view with dtype is deprecated',
            DeprecationWarning)
        return self.reduced_view().view(dtype)
    ndim = self._shape.size()
    if ndim <= 1:
        # Already minimal.
        return self
    if self._c_contiguous:
        # C-contiguous arrays always collapse to a single flat axis.
        view = self.view()
        view._shape.assign(1, self.size)
        view._strides.assign(1, self.dtype.itemsize)
        view._update_f_contiguity()
        return view
    # General case: merge adjacent axes whose strides allow it.
    internal.get_reduced_dims(
        self._shape, self._strides, self.dtype.itemsize, shape, strides)
    if ndim == <Py_ssize_t>shape.size():
        # No axes could be merged.
        return self
    # TODO(niboshi): Confirm update_x_contiguity flags
    return self._view(type(self), shape, strides, False, True, self)
cpdef _update_c_contiguity(self):
    # Recomputes the cached C-contiguity flag from shape/strides.
    if self.size == 0:
        # Empty arrays are contiguous by convention.
        self._c_contiguous = True
        return
    self._c_contiguous = internal.get_c_contiguity(
        self._shape, self._strides, self.dtype.itemsize)

cpdef _update_f_contiguity(self):
    # Recomputes the cached Fortran-contiguity flag from shape/strides.
    cdef Py_ssize_t i, count
    cdef shape_t rev_shape
    cdef strides_t rev_strides
    if self.size == 0:
        self._f_contiguous = True
        return
    if self._c_contiguous:
        # A C-contiguous array is also F-contiguous iff it has at most
        # one axis with extent > 1.
        count = 0
        for i in self._shape:
            if i == 1:
                count += 1
        self._f_contiguous = (<Py_ssize_t>self._shape.size()) - count <= 1
        return
    # F-contiguity equals C-contiguity of the reversed shape/strides.
    rev_shape.assign(self._shape.rbegin(), self._shape.rend())
    rev_strides.assign(self._strides.rbegin(), self._strides.rend())
    self._f_contiguous = internal.get_c_contiguity(
        rev_shape, rev_strides, self.dtype.itemsize)

cpdef _update_contiguity(self):
    # Refreshes both contiguity flags.
    self._update_c_contiguity()
    self._update_f_contiguity()
cpdef _set_shape_and_strides(self, const shape_t& shape,
                             const strides_t& strides,
                             bint update_c_contiguity,
                             bint update_f_contiguity):
    # Installs new shape/strides, recomputes ``size`` and (optionally)
    # the contiguity flags. Callers that already know the resulting
    # contiguity pass False to skip the recomputation.
    if shape.size() != strides.size():
        raise ValueError('len(shape) != len(strides)')
    if shape.size() > _carray.MAX_NDIM:
        msg = 'maximum supported dimension for an ndarray is '
        msg += f'{_carray.MAX_NDIM}, found {shape.size()}'
        raise ValueError(msg)
    self._shape = shape
    self._strides = strides
    self.size = internal.prod(shape)
    if update_c_contiguity:
        self._update_c_contiguity()
    if update_f_contiguity:
        self._update_f_contiguity()
cdef _ndarray_base _view(self, subtype, const shape_t& shape,
                         const strides_t& strides,
                         bint update_c_contiguity,
                         bint update_f_contiguity, obj):
    # Create a view of `self` with the given shape/strides.  `subtype`
    # supports ndarray subclasses; `obj` is forwarded to __new__.
    cdef _ndarray_base v
    # Use `_no_init=True` to skip recomputation of contiguity. Now
    # calling `__array_finalize__` is responsibility of this method.`
    v = ndarray.__new__(subtype, _obj=obj, _no_init=True)
    v.data = self.data
    # Views always point at the root array, never at another view.
    v.base = self.base if self.base is not None else self
    v.dtype = self.dtype
    v._c_contiguous = self._c_contiguous
    v._f_contiguous = self._f_contiguous
    v._index_32_bits = self._index_32_bits
    v._set_shape_and_strides(
        shape, strides, update_c_contiguity, update_f_contiguity)
    if subtype is not ndarray:
        v.__array_finalize__(self)
    return v
cpdef _set_contiguous_strides(
        self, Py_ssize_t itemsize, bint is_c_contiguous):
    # Fill self._strides in place for a contiguous layout and update
    # self.size; only the contiguity flag not implied by construction is
    # recomputed.
    self.size = internal.get_contiguous_strides_inplace(
        self._shape, self._strides, itemsize, is_c_contiguous, True)
    if is_c_contiguous:
        self._c_contiguous = True
        self._update_f_contiguity()
    else:
        self._f_contiguous = True
        self._update_c_contiguity()
cdef function.CPointer get_pointer(self):
    # Wrap this array in a CArray struct for passing to CUDA kernels.
    return _CArray_from_ndarray(self)
cpdef object toDlpack(self):
    """Zero-copy conversion to a DLPack tensor.

    DLPack is a open in memory tensor structure proposed in this
    repository: `dmlc/dlpack <https://github.com/dmlc/dlpack>`_.

    This function returns a :class:`PyCapsule` object which contains a
    pointer to a DLPack tensor converted from the own ndarray. This
    function does not copy the own data to the output DLpack tensor
    but it shares the pointer which is pointing to the same memory region
    for the data.

    Returns:
        dltensor (:class:`PyCapsule`): Output DLPack tensor which is
            encapsulated in a :class:`PyCapsule` object.

    .. seealso::

        :meth:`~cupy.fromDlpack` is a method for zero-copy conversion from
        a DLPack tensor (which is encapsulated in a :class:`PyCapsule`
        object) to a :class:`ndarray`

    .. warning::

        As of the DLPack v0.3 specification, it is (implicitly) assumed
        that the user is responsible to ensure the Producer and the
        Consumer are operating on the same stream. This requirement might
        be relaxed/changed in a future DLPack version.

    .. admonition:: Example

        >>> import cupy
        >>> array1 = cupy.array([0, 1, 2], dtype=cupy.float32)
        >>> dltensor = array1.toDlpack()
        >>> array2 = cupy.fromDlpack(dltensor)
        >>> cupy.testing.assert_array_equal(array1, array2)

    """
    # Delegates to the module-level implementation in cupy._core.dlpack.
    return dlpack.toDlpack(self)
cdef inline _carray.CArray _CArray_from_ndarray(_ndarray_base arr):
    # Creates CArray from ndarray.
    # Note that this function cannot be defined in _carray.pxd because that
    # would cause cyclic cimport dependencies.
    cdef _carray.CArray carr = _carray.CArray.__new__(_carray.CArray)
    # The CArray borrows the ndarray's device pointer; `arr` must stay
    # alive for as long as the CArray is in use.
    carr.init(<void*>arr.data.ptr, arr.size, arr._shape, arr._strides)
    return carr
# Types accepted by the __array_ufunc__/__array_function__ protocols.
_HANDLED_TYPES = (ndarray, numpy.ndarray)


# =============================================================================
# compile_with_cache
# =============================================================================

# TODO(niboshi): Move it out of core.pyx
cdef bint _is_hip = runtime._is_hip_environment
cdef int _cuda_runtime_version = -1  # -1 means "not yet queried"
cdef str _cuda_path = ''  # '' for uninitialized, None for non-existing

# Headers textually prepended (in this order) to every kernel compiled
# through compile_with_cache when prepend_cupy_headers is True.
cdef list cupy_header_list = [
    'cupy/complex.cuh',
    'cupy/carray.cuh',
    'cupy/atomics.cuh',
    'cupy/math_constants.h',
]
if _is_hip:
    cupy_header_list.append('cupy/hip_workaround.cuh')

# expose to Python for unit testing
_cupy_header_list = cupy_header_list

cdef str _cupy_header = ''.join(
    ['#include <%s>\n' % i for i in cupy_header_list])

# This is indirect include header list.
# These header files are subject to a hash key.
cdef list _cupy_extra_header_list = [
    'cupy/complex/complex.h',
    'cupy/complex/math_private.h',
    'cupy/complex/complex_inl.h',
    'cupy/complex/arithmetic.h',
    'cupy/complex/cproj.h',
    'cupy/complex/cexp.h',
    'cupy/complex/cexpf.h',
    'cupy/complex/clog.h',
    'cupy/complex/clogf.h',
    'cupy/complex/cpow.h',
    'cupy/complex/ccosh.h',
    'cupy/complex/ccoshf.h',
    'cupy/complex/csinh.h',
    'cupy/complex/csinhf.h',
    'cupy/complex/ctanh.h',
    'cupy/complex/ctanhf.h',
    'cupy/complex/csqrt.h',
    'cupy/complex/csqrtf.h',
    'cupy/complex/catrig.h',
    'cupy/complex/catrigf.h',
    'cupy/swap.cuh',
    'cupy/tuple/type_traits.h',
    'cupy/tuple/tuple.h',
    'cupy/tuple.cuh',
]

# Lazily populated caches; see _get_header_dir_path / _get_header_source.
cdef str _header_path_cache = None
cdef str _header_source = None
cdef dict _header_source_map = {}
cpdef str _get_header_dir_path():
    """Return the absolute path of CuPy's bundled `include` directory."""
    global _header_path_cache
    if _header_path_cache is not None:
        return _header_path_cache
    # Cython cannot use __file__ in global scope, so resolve it lazily here
    # and memoize the result.
    include_dir = os.path.join(os.path.dirname(__file__), 'include')
    _header_path_cache = os.path.abspath(include_dir)
    return _header_path_cache
cpdef str _get_header_source():
    """Return the concatenated text of all bundled CuPy headers.

    As a side effect, populates ``_header_source_map`` (header path as
    bytes -> header contents as bytes).
    """
    global _header_source
    global _header_source_map
    cdef str header_path, base_path, file_path, header
    cdef list source
    if _header_source is not None and _header_source_map:
        return _header_source
    base_path = _get_header_dir_path()
    source = []
    for file_path in _cupy_extra_header_list + cupy_header_list:
        header_path = os.path.join(base_path, file_path)
        with open(header_path) as header_file:
            header = header_file.read()
        source.append(header)
        _header_source_map[file_path.encode()] = header.encode()
    _header_source = '\n'.join(source)
    return _header_source
cpdef dict _get_header_source_map():
    # Mapping of header path (bytes) -> header contents (bytes), built
    # lazily by _get_header_source() as a side effect.
    global _header_source_map
    if not _header_source_map:
        _get_header_source()
    return _header_source_map
# added at the module level for precompiling the regex
# Matches an `#include <cuComplex.h>` line with arbitrary whitespace
# between the tokens (the leading '' token permits leading whitespace).
_cucomplex_include_tokens = ['', '#', 'include', '<', r'cuComplex\.h', '>']
_cucomplex_include_pattern = re.compile(r'\s*'.join(_cucomplex_include_tokens))
cdef inline str _translate_cucomplex_to_thrust(str source):
    """Rewrite ``#include <cuComplex.h>`` lines to CuPy's bridge header.

    Each matching include line is replaced with
    ``#include <cupy/cuComplex_bridge.h>`` (tagged with a
    ``// translate_cucomplex`` marker); all other lines are kept as-is.
    """
    lines = []
    for line in source.splitlines(keepends=True):
        if _cucomplex_include_pattern.match(line):
            lines.append('#include <cupy/cuComplex_bridge.h> '
                         '// translate_cucomplex\n')
        else:
            # Bug fix: the original used `lines += line`, which extends the
            # list with the individual *characters* of `line`.  The final
            # join reassembled them, so the output was correct only by
            # accident, at O(len(line)) appends per line.
            lines.append(line)
    return ''.join(lines)
cpdef function.Module compile_with_cache(
        str source, tuple options=(), arch=None, cachd_dir=None,
        prepend_cupy_headers=True, backend='nvrtc', translate_cucomplex=False,
        enable_cooperative_groups=False, name_expressions=None,
        log_stream=None, bint jitify=False):
    """Compile a CUDA/HIP source through the cached compiler pipeline.

    NOTE: the misspelled parameter name ``cachd_dir`` (sic) is part of the
    public signature and must not be renamed.
    """
    if translate_cucomplex:
        source = _translate_cucomplex_to_thrust(source)
        # NOTE(review): this appends on every call, so repeated calls with
        # translate_cucomplex=True grow cupy_header_list with duplicate
        # entries -- confirm this is intended.
        cupy_header_list.append('cupy/cuComplex_bridge.h')
        prepend_cupy_headers = True

    if prepend_cupy_headers:
        source = _cupy_header + source
    extra_source = _get_header_source()
    options += ('-I%s' % _get_header_dir_path(),)

    # The variable _cuda_runtime_version is declared in cupy/_core/core.pyx,
    # but it might not have been set appropriately before coming here.
    global _cuda_runtime_version
    if _cuda_runtime_version < 0:
        _cuda_runtime_version = runtime.runtimeGetVersion()

    global _cuda_path
    if _cuda_path == '':
        # '' means "not looked up yet"; None means the lookup failed.
        if not _is_hip:
            _cuda_path = cuda.get_cuda_path()
        else:
            _cuda_path = cuda.get_rocm_path()

    if not _is_hip:
        # Pick the bundled CUDA header set matching the runtime version.
        if 10020 <= _cuda_runtime_version < 10030:
            bundled_include = 'cuda-10.2'
        elif 11000 <= _cuda_runtime_version < 11010:
            bundled_include = 'cuda-11.0'
        elif 11010 <= _cuda_runtime_version < 11020:
            bundled_include = 'cuda-11.1'
        elif 11020 <= _cuda_runtime_version < 12000:
            # CUDA Enhanced Compatibility
            bundled_include = 'cuda-11'
        elif 12000 <= _cuda_runtime_version < 13000:
            # CUDA Enhanced Compatibility
            bundled_include = 'cuda-12'
        else:
            # CUDA versions not yet supported.
            bundled_include = None
        if bundled_include is None and _cuda_path is None:
            raise RuntimeError(
                'Failed to auto-detect CUDA root directory. '
                'Please specify `CUDA_PATH` environment variable if you '
                'are using CUDA versions not yet supported by CuPy.')
        if bundled_include is not None:
            options += ('-I' + os.path.join(
                _get_header_dir_path(), 'cupy', '_cuda', bundled_include),)
    elif _is_hip:
        if _cuda_path is None:
            raise RuntimeError(
                'Failed to auto-detect ROCm root directory. '
                'Please specify `ROCM_HOME` environment variable.')

    if _cuda_path is not None:
        options += ('-I' + os.path.join(_cuda_path, 'include'),)

    return cuda.compiler._compile_module_with_cache(
        source, options, arch, cachd_dir, extra_source, backend,
        enable_cooperative_groups=enable_cooperative_groups,
        name_expressions=name_expressions, log_stream=log_stream,
        jitify=jitify)
# =============================================================================
# Routines
# =============================================================================

# Identity-assignment snippet shared by simple copy routines.
cdef str _id = 'out0 = in0'

# Elementwise kernel that broadcasts a scalar into every output element.
cdef fill_kernel = ElementwiseKernel('T x', 'T y', 'y = x', 'cupy_fill')
# CUDA body shared by the floating-point specializations of divmod.
cdef str _divmod_float = '''
out0_type a = _floor_divide(in0, in1);
out0 = a;
out1 = in0 - a * in1'''


# divmod ufunc: integer types guard against division by zero (returning
# (0, 0), matching NumPy's behavior of not trapping on the GPU); float
# types use the unguarded body above.
divmod = create_ufunc(
    'cupy_divmod',
    ('bb->bb', 'BB->BB', 'hh->hh', 'HH->HH', 'ii->ii', 'II->II', 'll->ll',
     'LL->LL', 'qq->qq', 'QQ->QQ',
     ('ee->ee', _divmod_float),
     ('ff->ff', _divmod_float),
     ('dd->dd', _divmod_float)),
    '''
    if (in1 == 0) {
        out0 = 0;
        out1 = 0;
    } else {
        out0_type a = _floor_divide(in0, in1);
        out0 = a;
        out1 = in0 - a * in1;
    }''')
# Device preamble for the rounding routines below:
# - round_float: round a float to the nearest long long (HIP has no
#   __float2ll_rn intrinsic, so llrintf is used there instead).
# - pow10<T>: integer power of ten via binary exponentiation.
cdef _round_preamble = '''
#ifdef __HIP_DEVICE_COMPILE__
#define round_float llrintf
#else
#define round_float __float2ll_rn
#endif
template<typename T> __device__ T pow10(long long n){
    T x = 1, a = 10;
    while (n) {
        if (n & 1) x *= a;
        a *= a;
        n >>= 1;
    }
    return x;
};
'''

# Round a real value to `in1` decimal places (negative in1 rounds to tens,
# hundreds, ...).
cdef _round_float = '''
if (in1 == 0) {
    out0 = rint(in0);
} else {
    double x;
    x = pow10<double>(abs(in1)); // TODO(okuta): Move before loop
    out0 = in1 < 0 ? rint(in0 / x) * x : rint(in0 * x) / x;
}'''

# Same as _round_float, applied independently to the real and imaginary
# parts of a complex value.
cdef _round_complex = '''
if (in1 == 0) {
    out0 = in0_type(rint(in0.real()), rint(in0.imag()));
} else {
    double x = pow10<double>(abs(in1)); // TODO(okuta): Move before loop
    if (in1 < 0) {
        out0 = in0_type(rint(in0.real() / x) * x,
                        rint(in0.imag() / x) * x);
    } else {
        out0 = in0_type(rint(in0.real() * x) / x,
                        rint(in0.imag() * x) / x);
    }
}'''

# There is a known incompatibility with NumPy (as of 1.16.4) such as
# `numpy.around(2**63, -1) == cupy.around(2**63, -1)` gives `False`.
#
# NumPy seems to round integral values via double. As double has
# only 53 bit precision, last few bits of (u)int64 value may be lost.
# As a consequence, `numpy.around(2**63, -1)` does NOT round up the
# last digit (9223372036854775808 instead of ...810).
#
# The following code fixes the problem, so `cupy.around(2**63, -1)`
# gives `...810`, which (may correct but) is incompatible with NumPy.
_round_ufunc = create_ufunc(
    'cupy_round',
    ('?q->e',
     'bq->b', 'Bq->B', 'hq->h', 'Hq->H', 'iq->i', 'Iq->I', 'lq->l', 'Lq->L',
     'qq->q', 'Qq->Q',
     ('eq->e', _round_float),
     ('fq->f', _round_float),
     ('dq->d', _round_float),
     ('Fq->F', _round_complex),
     ('Dq->D', _round_complex)),
    '''
    if (in1 >= 0) {
        out0 = in0;
    } else {
        // TODO(okuta): Move before loop
        long long x = pow10<long long>(-in1 - 1);
        // TODO(okuta): Check Numpy
        // `cupy.around(-123456789, -4)` works as follows:
        // (1) scale by `x` above: -123456.789
        // (2) split at the last 2 digits: -123400 + (-5.6789 * 10)
        // (3) round the latter by `rint()`: -123400 + (-6.0 * 10)
        // (4) unscale by `x` above: -123460000
        long long q = in0 / x / 100;
        int r = in0 - q*x*100;
        out0 = (q*100 + round_float(r/(x*10.0f))*10) * x;
    }''', preamble=_round_preamble)
# -----------------------------------------------------------------------------
# Array creation routines
# -----------------------------------------------------------------------------

cpdef _ndarray_base array(obj, dtype=None, bint copy=True, order='K',
                          bint subok=False, Py_ssize_t ndmin=0):
    """Core implementation of :func:`cupy.array`.

    Dispatch order:
      1. existing CuPy ndarrays,
      2. objects exposing ``__cuda_array_interface__``,
      3. objects exposing ``__cupy_get_ndarray__``,
      4. (nested) sequences of same-shape NumPy/CuPy arrays (fast
         concatenation path),
      5. everything else via :func:`numpy.array` on the host.
    """
    # TODO(beam2d): Support subok options
    if subok:
        raise NotImplementedError
    if order is None:
        order = 'K'
    if isinstance(obj, ndarray):
        return _array_from_cupy_ndarray(obj, dtype, copy, order, ndmin)
    if hasattr(obj, '__cuda_array_interface__'):
        return _array_from_cuda_array_interface(
            obj, dtype, copy, order, subok, ndmin)
    if hasattr(obj, '__cupy_get_ndarray__'):
        return _array_from_cupy_ndarray(
            obj.__cupy_get_ndarray__(), dtype, copy, order, ndmin)
    concat_shape, concat_type, concat_dtype = (
        _array_info_from_nested_sequence(obj))
    if concat_shape is not None:
        return _array_from_nested_sequence(
            obj, dtype, order, ndmin, concat_shape, concat_type, concat_dtype)
    return _array_default(obj, dtype, order, ndmin)
cdef _ndarray_base _array_from_cupy_ndarray(
        obj, dtype, bint copy, order, Py_ssize_t ndmin):
    """Convert an existing CuPy ndarray per dtype/copy/order/ndmin."""
    cdef Py_ssize_t cur_ndim
    cdef _ndarray_base result, src
    src = obj
    if dtype is None:
        dtype = src.dtype
    if src.data.device_id != device.get_device_id():
        # Cross-device input: materialize a copy on the current device
        # before casting.
        result = src.copy(order=order).astype(dtype, copy=False)
    else:
        result = src.astype(dtype, order=order, copy=copy)
    cur_ndim = result._shape.size()
    if ndmin > cur_ndim:
        if result is obj:
            # astype with copy=False may hand back `obj` itself; take a
            # view so the caller's array is not reshaped in place.
            result = result.view()
        result.shape = (1,) * (ndmin - cur_ndim) + result.shape
    return result
cdef _ndarray_base _array_from_cuda_array_interface(
        obj, dtype, bint copy, order, bint subok, Py_ssize_t ndmin):
    # Wrap the foreign device memory as a zero-copy ndarray first, then
    # run the regular `array` path on the wrapper.
    return array(
        _convert_object_with_cuda_array_interface(obj),
        dtype, copy, order, subok, ndmin)
cdef _ndarray_base _array_from_nested_sequence(
        obj, dtype, order, Py_ssize_t ndmin, concat_shape, concat_type,
        concat_dtype):
    """Build an ndarray from a nested sequence of same-shape arrays.

    `concat_*` come from _array_info_from_nested_sequence.
    """
    cdef Py_ssize_t ndim

    # resulting array is C order unless 'F' is explicitly specified
    # (i.e., it ignores order of element arrays in the sequence)
    if order is not None and len(order) >= 1 and order[0] in 'Ff':
        order = 'F'
    else:
        order = 'C'

    ndim = len(concat_shape)
    if ndmin > ndim:
        # Prepend length-1 axes to honor ndmin.
        concat_shape = (1,) * (ndmin - ndim) + concat_shape
    if dtype is None:
        # Device arrays are stored little-endian.
        dtype = concat_dtype.newbyteorder('<')

    if concat_type is numpy.ndarray:
        return _array_from_nested_numpy_sequence(
            obj, concat_dtype, dtype, concat_shape, order, ndmin)
    elif concat_type is ndarray:  # TODO(takagi) Consider subclases
        return _array_from_nested_cupy_sequence(
            obj, dtype, concat_shape, order)
    else:
        assert False
cdef _ndarray_base _array_from_nested_numpy_sequence(
        arrays, src_dtype, dst_dtype, const shape_t& shape, order,
        Py_ssize_t ndmin):
    """Concatenate a sequence of NumPy arrays on the host and upload it."""
    a_dtype = get_dtype(dst_dtype)  # convert to numpy.dtype
    if a_dtype.char not in '?bhilqBHILQefdFD':
        raise ValueError('Unsupported dtype %s' % a_dtype)
    cdef _ndarray_base a  # allocate it after pinned memory is secured
    cdef size_t itemcount = internal.prod(shape)
    cdef size_t nbytes = itemcount * a_dtype.itemsize
    stream = stream_module.get_current_stream()
    # Note: even if arrays are already backed by pinned memory, we still need
    # to allocate an extra buffer and copy from it to avoid potential data
    # race, see the discussion here:
    # https://github.com/cupy/cupy/pull/5155#discussion_r621808782
    cdef pinned_memory.PinnedMemoryPointer mem = (
        _alloc_async_transfer_buffer(nbytes))
    if mem is not None:
        # write concatenated arrays to the pinned memory directly
        src_cpu = (
            numpy.frombuffer(mem, a_dtype, itemcount)
            .reshape(shape, order=order))
        _concatenate_numpy_array(
            [numpy.expand_dims(e, 0) for e in arrays],
            0,
            get_dtype(src_dtype),
            a_dtype,
            src_cpu)
        a = ndarray(shape, dtype=a_dtype, order=order)
        a.data.copy_from_host_async(mem.ptr, nbytes)
        # Keep the pinned buffer alive until the async copy completes.
        pinned_memory._add_to_watch_list(stream.record(), mem)
    else:
        # fallback to numpy array and send it to GPU
        # Note: a_cpu.ndim is always >= 1
        a_cpu = numpy.array(arrays, dtype=a_dtype, copy=False, order=order,
                            ndmin=ndmin)
        a = ndarray(shape, dtype=a_dtype, order=order)
        a.data.copy_from_host(a_cpu.ctypes.data, nbytes)
    return a
cdef _ndarray_base _array_from_nested_cupy_sequence(obj, dtype, shape, order):
    """Concatenate nested CuPy arrays, then reshape/cast to the target."""
    elems = []
    for x in _flatten_list(obj):
        # 0-dim arrays cannot be concatenated; promote them to 1-dim first.
        elems.append(cupy.expand_dims(x, 0) if x.ndim == 0 else x)
    joined = _manipulation.concatenate_method(elems, 0)
    return joined.reshape(shape).astype(dtype, order=order, copy=False)
cdef _ndarray_base _array_default(obj, dtype, order, Py_ssize_t ndmin):
    """Fallback path of `array`: build on the host via numpy.array, upload."""
    # 'K'/'A' resolve to 'F' only for F-but-not-C-contiguous numpy inputs.
    if order is not None and len(order) >= 1 and order[0] in 'KAka':
        if isinstance(obj, numpy.ndarray) and obj.flags.fnc:
            order = 'F'
        else:
            order = 'C'
    a_cpu = numpy.array(obj, dtype=dtype, copy=False, order=order,
                        ndmin=ndmin)
    if a_cpu.dtype.char not in '?bhilqBHILQefdFD':
        raise ValueError('Unsupported dtype %s' % a_cpu.dtype)
    # Force a little-endian layout before uploading.
    a_cpu = a_cpu.astype(a_cpu.dtype.newbyteorder('<'), copy=False)
    a_dtype = a_cpu.dtype
    cdef shape_t a_shape = a_cpu.shape
    cdef _ndarray_base a = ndarray(a_shape, dtype=a_dtype, order=order)
    if a_cpu.ndim == 0:
        # 0-dim input: set the single value via fill() instead of a memcpy.
        a.fill(a_cpu)
        return a
    cdef Py_ssize_t nbytes = a.nbytes
    stream = stream_module.get_current_stream()
    # Note: even if obj is already backed by pinned memory, we still need to
    # allocate an extra buffer and copy from it to avoid potential data race,
    # see the discussion here:
    # https://github.com/cupy/cupy/pull/5155#discussion_r621808782
    cdef pinned_memory.PinnedMemoryPointer mem = (
        _alloc_async_transfer_buffer(nbytes))
    if mem is not None:
        # Stage into pinned memory for an asynchronous transfer; the buffer
        # is kept alive on a watch list until the copy event completes.
        src_cpu = numpy.frombuffer(mem, a_dtype, a_cpu.size)
        src_cpu[:] = a_cpu.ravel(order)
        a.data.copy_from_host_async(mem.ptr, nbytes)
        pinned_memory._add_to_watch_list(stream.record(), mem)
    else:
        # Synchronous fallback when no pinned memory is available.
        a.data.copy_from_host(a_cpu.ctypes.data, nbytes)
    return a
cdef tuple _array_info_from_nested_sequence(obj):
    """Inspect `obj` for the fast concatenation path of `array`.

    Returns ``(shape, type, dtype)`` when `obj` is a (nested) list/tuple
    whose leaves are all NumPy arrays or all CuPy arrays with a common
    shape; otherwise returns ``(None, None, None)``.
    """
    if not isinstance(obj, (list, tuple)):
        return None, None, None
    return _compute_concat_info_impl(obj)
cdef tuple _compute_concat_info_impl(obj):
    """Recursive worker for _array_info_from_nested_sequence."""
    cdef Py_ssize_t n_children
    no_info = (None, None, None)
    if isinstance(obj, (numpy.ndarray, ndarray)):
        # Leaf: an actual array.
        return obj.shape, type(obj), obj.dtype
    if hasattr(obj, '__cupy_get_ndarray__'):
        # Leaf: an object convertible to a CuPy ndarray.
        return obj.shape, ndarray, obj.dtype
    if not isinstance(obj, (list, tuple)):
        return no_info
    n_children = len(obj)
    if n_children == 0:
        return no_info
    shape0, type0, dtype0 = _compute_concat_info_impl(obj[0])
    if shape0 is None:
        return no_info
    for child in obj[1:]:
        shape1, type1, dtype1 = _compute_concat_info_impl(child)
        # Every child must match the first in shape and array type.
        if shape1 is None or shape1 != shape0 or type1 is not type0:
            return no_info
        if dtype1 != dtype0:
            # Dtypes may differ; accumulate a common promoted dtype.
            dtype0 = numpy.promote_types(dtype0, dtype1)
    return (n_children,) + shape0, type0, dtype0
cdef list _flatten_list(object obj):
    """Recursively flatten nested lists/tuples into a flat list of leaves."""
    if not isinstance(obj, (list, tuple)):
        return [obj]
    flat = []
    for item in obj:
        flat.extend(_flatten_list(item))
    return flat
# Whether numpy.concatenate accepts an `out` argument (added in NumPy 1.14).
cdef bint _numpy_concatenate_has_out_argument = (
    numpy.lib.NumpyVersion(numpy.__version__) >= '1.14.0')


cdef inline _concatenate_numpy_array(arrays, axis, src_dtype, dst_dtype, out):
    # Concatenate host arrays directly into `out` when possible.
    # type(*_dtype) must be numpy.dtype
    if (_numpy_concatenate_has_out_argument
            and src_dtype.kind == dst_dtype.kind):
        # concatenate only accepts same_kind casting
        numpy.concatenate(arrays, axis, out)
    else:
        # Different kinds (or old NumPy): concatenate first, then cast on
        # assignment into `out`.
        out[:] = numpy.concatenate(arrays, axis)
cdef inline _alloc_async_transfer_buffer(Py_ssize_t nbytes):
    # Try to allocate a pinned (page-locked) host buffer for asynchronous
    # host-to-device transfers.  Returns None (after warning) when the
    # allocation fails so callers can fall back to a synchronous copy;
    # any error other than memory exhaustion is re-raised.
    try:
        return pinned_memory.alloc_pinned_memory(nbytes)
    except CUDARuntimeError as e:
        if e.status != runtime.errorMemoryAllocation:
            raise
        warnings.warn(
            'Using synchronous transfer as pinned memory ({} bytes) '
            'could not be allocated. '
            'This generally occurs because of insufficient host memory. '
            'The original error was: {}'.format(nbytes, e),
            _util.PerformanceWarning)
        return None
cpdef _ndarray_base _internal_ascontiguousarray(_ndarray_base a):
    """Return `a` if it is C-contiguous, else a C-contiguous copy."""
    if a._c_contiguous:
        return a
    out = _ndarray_init(ndarray, a._shape, a.dtype, None)
    elementwise_copy(a, out)
    return out
cpdef _ndarray_base _internal_asfortranarray(_ndarray_base a):
    """Return `a` if it is F-contiguous, else an F-contiguous copy.

    For 2-D float32/float64 C-contiguous inputs, the transpose is done by
    cuBLAS ``geam``; otherwise an elementwise copy into an F-ordered array
    is used.
    """
    cdef _ndarray_base newarray
    cdef int m, n
    cdef intptr_t handle
    if a._f_contiguous:
        return a
    newarray = ndarray(a.shape, a.dtype, order='F')
    if (a._c_contiguous and a._shape.size() == 2 and
            (a.dtype == numpy.float32 or a.dtype == numpy.float64)):
        m, n = a.shape
        handle = device.get_cublas_handle()
        # geam scalar coefficients alpha=1, beta=0 (host pointers).
        one = numpy.array(1, dtype=a.dtype)
        zero = numpy.array(0, dtype=a.dtype)
        if a.dtype == numpy.float32:
            cublas.sgeam(
                handle,
                1,  # transpose a
                1,  # transpose newarray
                m, n, one.ctypes.data, a.data.ptr, n,
                zero.ctypes.data, a.data.ptr, n, newarray.data.ptr, m)
        elif a.dtype == numpy.float64:
            cublas.dgeam(
                handle,
                1,  # transpose a
                1,  # transpose newarray
                m, n, one.ctypes.data, a.data.ptr, n,
                zero.ctypes.data, a.data.ptr, n, newarray.data.ptr, m)
    else:
        elementwise_copy(a, newarray)
    return newarray
cpdef _ndarray_base ascontiguousarray(_ndarray_base a, dtype=None):
    """Return a C-contiguous array (at least 1-D) with the given dtype."""
    cdef bint same_dtype
    cdef bint zero_dim = a._shape.size() == 0
    if dtype is None:
        dtype = a.dtype
        same_dtype = True
    else:
        dtype = get_dtype(dtype)
        same_dtype = dtype == a.dtype
    if same_dtype and a._c_contiguous:
        # Already contiguous; 0-dim inputs are still promoted to 1-dim.
        return _manipulation._ndarray_ravel(a, 'C') if zero_dim else a
    out = ndarray((1,) if zero_dim else a.shape, dtype)
    elementwise_copy(a, out)
    return out
cpdef _ndarray_base asfortranarray(_ndarray_base a, dtype=None):
    """Return an F-contiguous array (at least 1-D) with the given dtype."""
    cdef _ndarray_base out
    cdef bint same_dtype
    cdef bint zero_dim = a._shape.size() == 0
    if dtype is None:
        dtype = a.dtype
        same_dtype = True
    else:
        dtype = get_dtype(dtype)
        same_dtype = dtype == a.dtype
    if same_dtype and a._f_contiguous:
        # Already F-contiguous; 0-dim inputs are still promoted to 1-dim.
        return _manipulation._ndarray_ravel(a, 'F') if zero_dim else a
    if same_dtype and not zero_dim:
        # Same dtype: the internal helper may use cuBLAS for 2-D floats.
        return _internal_asfortranarray(a)
    out = ndarray((1,) if zero_dim else a.shape, dtype, order='F')
    elementwise_copy(a, out)
    return out
cpdef _ndarray_base _convert_object_with_cuda_array_interface(a):
    """Wrap an object exposing ``__cuda_array_interface__`` as an ndarray.

    Zero-copy: the result refers to `a`'s device memory through
    UnownedMemory, which keeps `a` alive as the owner.
    """
    if runtime._is_hip_environment:
        raise RuntimeError(
            'HIP/ROCm does not support cuda array interface')
    cdef Py_ssize_t sh, st
    cdef dict desc = a.__cuda_array_interface__
    cdef tuple shape = desc['shape']
    cdef int dev_id = -1  # -1 lets UnownedMemory resolve the device itself
    cdef size_t nbytes
    ptr = desc['data'][0]
    dtype = numpy.dtype(desc['typestr'])
    if dtype.byteorder == '>':
        raise ValueError('CuPy does not support the big-endian byte-order')
    mask = desc.get('mask')
    if mask is not None:
        raise ValueError('CuPy currently does not support masked arrays.')
    strides = desc.get('strides')
    if strides is not None:
        # Strided layout: the region must span the farthest-reaching axis.
        nbytes = 0
        for sh, st in zip(shape, strides):
            nbytes = max(nbytes, abs(sh * st))
    else:
        # strides omitted/None means C-contiguous.
        nbytes = internal.prod_sequence(shape) * dtype.itemsize
    # the v2 protocol sets ptr=0 for 0-size arrays, so we can't look up
    # the pointer attributes and must use the current device
    if nbytes == 0:
        dev_id = device.get_device_id()
    mem = memory_module.UnownedMemory(ptr, nbytes, a, dev_id)
    memptr = memory.MemoryPointer(mem, 0)
    # the v3 protocol requires an immediate synchronization, unless
    # 1. the stream is not set (ex: from v0 ~ v2) or is None
    # 2. users explicitly overwrite this requirement
    stream_ptr = desc.get('stream')
    if stream_ptr is not None:
        if _util.CUDA_ARRAY_INTERFACE_SYNC:
            runtime.streamSynchronize(stream_ptr)
    return ndarray(shape, dtype, memptr, strides)
cdef _ndarray_base _ndarray_init(subtype, const shape_t& shape, dtype, obj):
    # Fast-path internal constructor allocating a fresh C-contiguous array.
    # Use `_no_init=True` for fast init. Now calling `__array_finalize__` is
    # responsibility of this function.
    cdef _ndarray_base ret = ndarray.__new__(subtype, _obj=obj, _no_init=True)
    ret._init_fast(shape, dtype, True)
    if subtype is not ndarray:
        ret.__array_finalize__(obj)
    return ret
cdef _ndarray_base _create_ndarray_from_shape_strides(
        subtype, const shape_t& shape, const strides_t& strides, dtype, obj):
    """Allocate a buffer just large enough for the given shape/strides.

    Negative strides are supported: the allocation spans from the lowest
    to the highest addressed byte, and the returned array's data pointer
    is offset so that index 0 maps into the buffer correctly.
    """
    cdef int ndim = shape.size()
    # `begin` accumulates the most negative byte offset reachable (<= 0);
    # `end` the one-past-last positive extent (>= itemsize).
    cdef int64_t begin = 0, end = dtype.itemsize
    cdef memory.MemoryPointer ptr
    for i in range(ndim):
        if strides[i] > 0:
            end += strides[i] * (shape[i] - 1)
        elif strides[i] < 0:
            begin += strides[i] * (shape[i] - 1)
    # Bug fix: the element at index 0 must sit `-begin` bytes *into* the
    # allocation (begin <= 0).  The previous `+ begin` moved the data
    # pointer before the start of the buffer whenever any stride was
    # negative, producing out-of-bounds accesses.
    ptr = memory.alloc(end - begin) - begin
    return ndarray.__new__(
        subtype, shape, dtype, _obj=obj, memptr=ptr, strides=strides)
cpdef min_scalar_type(a):
    """
    For scalar ``a``, returns the data type with the smallest size
    and smallest scalar kind which can hold its value. For non-scalar
    array ``a``, returns the vector's dtype unmodified.

    .. seealso:: :func:`numpy.min_scalar_type`
    """
    if isinstance(a, ndarray):
        return a.dtype
    # Nested sequences of arrays report their (promoted) common dtype.
    _, concat_type, concat_dtype = _array_info_from_nested_sequence(a)
    if concat_type is None:
        return numpy.min_scalar_type(a)
    return concat_dtype
from cupy._core.core cimport _ndarray_base

# NOTE(review): the declarations below read like `.pxd` content that was
# concatenated into this file by the import -- confirm against the
# original repository layout.

# DLPack device-type constants needed at the Cython level.
cdef extern from './include/cupy/dlpack/dlpack.h' nogil:
    int device_CUDA 'kDLCUDA'
    int managed_CUDA 'kDLCUDAManaged'
    int device_ROCM 'kDLROCM'

cpdef object toDlpack(_ndarray_base array) except +
cpdef _ndarray_base fromDlpack(object dltensor) except +
cpdef from_dlpack(array)
cimport cpython # NOQA
from libc cimport stdlib
from libc.stdint cimport uint8_t
from libc.stdint cimport uint16_t
from libc.stdint cimport int32_t
from libc.stdint cimport int64_t
from libc.stdint cimport uint64_t
from libc.stdint cimport intptr_t
from libcpp.vector cimport vector
from cupy_backends.cuda.api cimport runtime
from cupy_backends.cuda cimport stream as stream_module
from cupy._core.core cimport _ndarray_base
from cupy.cuda cimport memory
import warnings
import cupy
import cupy._core.core as core
# Declarations mirrored from the bundled dlpack.h (DLPack C ABI).
cdef extern from './include/cupy/dlpack/dlpack.h' nogil:
    cdef int DLPACK_VERSION

    # Device kinds a DLPack tensor can live on.
    cdef enum DLDeviceType:
        kDLCPU
        kDLCUDA
        kDLCUDAHost
        kDLOpenCL
        kDLVulkan
        kDLMetal
        kDLVPI
        kDLROCM
        kDLROCMHost
        kDLExtDev
        kDLCUDAManaged
        kDLOneAPI
        kDLWebGPU
        kDLHexagon

    ctypedef struct DLDevice:
        DLDeviceType device_type
        int32_t device_id

    # Scalar type categories; a concrete dtype is (code, bits, lanes).
    cdef enum DLDataTypeCode:
        kDLInt
        kDLUInt
        kDLFloat
        kDLBfloat
        kDLComplex
        kDLBool

    ctypedef struct DLDataType:
        uint8_t code
        uint8_t bits
        uint16_t lanes

    ctypedef struct DLTensor:
        void* data
        DLDevice device
        int32_t ndim
        DLDataType dtype
        int64_t* shape
        int64_t* strides
        uint64_t byte_offset

    # A DLTensor plus ownership info; `deleter` releases manager_ctx.
    ctypedef struct DLManagedTensor:
        DLTensor dl_tensor
        void* manager_ctx
        void (*deleter)(DLManagedTensor*)  # noqa: E211
def get_build_version():
    """Return the DLPACK_VERSION constant of the bundled dlpack.h, as str."""
    return str(DLPACK_VERSION)
cdef void pycapsule_deleter(object dltensor):
    # Capsule destructor: frees the DLManagedTensor only if no consumer
    # ever took ownership of the capsule.
    cdef DLManagedTensor* dlm_tensor
    # Do not invoke the deleter on a used capsule
    # (per the DLPack protocol a consumer renames a consumed capsule, so
    #  PyCapsule_IsValid with the name 'dltensor' then fails).
    if cpython.PyCapsule_IsValid(dltensor, 'dltensor'):
        dlm_tensor = <DLManagedTensor*>cpython.PyCapsule_GetPointer(
            dltensor, 'dltensor')
        dlm_tensor.deleter(dlm_tensor)
cdef void deleter(DLManagedTensor* tensor) with gil:
    # DLManagedTensor.deleter installed by toDlpack() below.
    # Idempotent: a NULL manager_ctx marks an already-freed tensor.
    if tensor.manager_ctx is NULL:
        return
    # shape and strides share one allocation (see toDlpack).
    stdlib.free(tensor.dl_tensor.shape)
    # Release the reference toDlpack() took on the source ndarray.
    cpython.Py_DECREF(<_ndarray_base>tensor.manager_ctx)
    tensor.manager_ctx = NULL
    stdlib.free(tensor)
# The name of this function is following the framework integration guide of
# TensorComprehensions.
cpdef object toDlpack(_ndarray_base array) except +:
    """Build a 'dltensor' capsule sharing `array`'s memory (zero copy).

    A reference to `array` is stashed in ``manager_ctx`` and released by
    ``deleter``; shape and strides live in a single malloc'd block.
    """
    cdef DLManagedTensor* dlm_tensor = \
        <DLManagedTensor*>stdlib.malloc(sizeof(DLManagedTensor))
    cdef size_t ndim = array._shape.size()
    cdef DLTensor* dl_tensor = &dlm_tensor.dl_tensor
    cdef intptr_t data_ptr = array.data.ptr
    dl_tensor.data = <void*>data_ptr
    dl_tensor.ndim = ndim
    # One allocation holds the shape (first half) and strides (second half).
    cdef int64_t* shape_strides = \
        <int64_t*>stdlib.malloc(ndim * sizeof(int64_t) * 2)
    for n in range(ndim):
        shape_strides[n] = array._shape[n]
    dl_tensor.shape = shape_strides
    for n in range(ndim):
        # DLPack expresses strides in element counts, not bytes.
        shape_strides[n + ndim] = array._strides[n] // array.dtype.itemsize
    dl_tensor.strides = shape_strides + ndim
    dl_tensor.byte_offset = 0
    cdef DLDevice* device = &dl_tensor.device
    cdef bint is_managed
    cdef int dev_id = array.data.device_id
    if not runtime._is_hip_environment:
        # Managed (unified) memory gets its own DLPack device type.
        attrs = runtime.pointerGetAttributes(data_ptr)
        is_managed = (attrs.type == runtime.memoryTypeManaged)
        if is_managed:
            device.device_type = kDLCUDAManaged
            dev_id = 0  # make it accessible on CPU too
        else:
            device.device_type = kDLCUDA
    else:
        device.device_type = kDLROCM
    device.device_id = dev_id
    # Translate the numpy dtype kind into a (code, bits, lanes) triple.
    cdef DLDataType* dtype = &dl_tensor.dtype
    if array.dtype.kind == 'u':
        dtype.code = <uint8_t>kDLUInt
    elif array.dtype.kind == 'i':
        dtype.code = <uint8_t>kDLInt
    elif array.dtype.kind == 'f':
        dtype.code = <uint8_t>kDLFloat
    elif array.dtype.kind == 'c':
        dtype.code = <uint8_t>kDLComplex
    elif array.dtype.kind == 'b':
        dtype.code = <uint8_t>kDLBool
    else:
        raise ValueError('Unknown dtype')
    dtype.lanes = <uint16_t>1
    dtype.bits = <uint8_t>(array.dtype.itemsize * 8)
    dlm_tensor.manager_ctx = <void*>array
    # Keep the source array alive until the consumer runs the deleter.
    cpython.Py_INCREF(array)
    dlm_tensor.deleter = deleter
    return cpython.PyCapsule_New(dlm_tensor, 'dltensor', pycapsule_deleter)
# TODO(leofang): Support kDLCUDAPinned and kDLROCMPinned
cdef class DLPackMemory(memory.BaseMemory):

    """Memory object for a dlpack tensor.

    This does not allocate any memory; it takes ownership of the consumed
    DLPack capsule and frees it through the producer's deleter.

    """

    cdef DLManagedTensor* dlm_tensor
    cdef object dltensor

    def __init__(self, object dltensor):
        cdef DLManagedTensor* dlm_tensor
        # sanity checks
        if not cpython.PyCapsule_IsValid(dltensor, 'dltensor'):
            raise ValueError('A DLPack tensor object cannot be consumed '
                             'multiple times')
        dlm_tensor = <DLManagedTensor*>cpython.PyCapsule_GetPointer(
            dltensor, 'dltensor')
        # The producer's backend must match the one CuPy was built for.
        if runtime._is_hip_environment:
            if dlm_tensor.dl_tensor.device.device_type != kDLROCM:
                raise RuntimeError('CuPy is built against ROCm/HIP, different '
                                   'from the backend that backs the incoming '
                                   'DLPack tensor')
        else:
            if dlm_tensor.dl_tensor.device.device_type not in (
                    kDLCUDA, kDLCUDAManaged):
                raise RuntimeError('CuPy is built against CUDA, different '
                                   'from the backend that backs the incoming '
                                   'DLPack tensor')
        self.dltensor = dltensor
        self.dlm_tensor = dlm_tensor
        self.ptr = <intptr_t>dlm_tensor.dl_tensor.data
        if dlm_tensor.dl_tensor.device.device_type == kDLCUDAManaged:
            # look up the actual physical device as the id from
            # dl_tensor could be 0
            attrs = runtime.pointerGetAttributes(self.ptr)
            self.device_id = attrs.device
        else:
            self.device_id = dlm_tensor.dl_tensor.device.device_id
        # Bug fix: the element count is the *product* of the extents; the
        # previous code summed them (`n += s`), under-reporting size for
        # any multi-dimensional tensor and reporting 0 bytes for 0-dim
        # tensors.  int64 accumulators avoid overflowing a 32-bit int on
        # large tensors.
        cdef int64_t numel = 1, extent = 0
        cdef int ndim = dlm_tensor.dl_tensor.ndim
        cdef int64_t* shape = dlm_tensor.dl_tensor.shape
        for extent in shape[:ndim]:
            numel *= extent
        self.size = dlm_tensor.dl_tensor.dtype.bits * numel // 8

    def __dealloc__(self):
        cdef DLManagedTensor* dlm_tensor = self.dlm_tensor
        # dlm_tensor could be uninitialized if an error is raised in __init__
        if dlm_tensor != NULL:
            dlm_tensor.deleter(dlm_tensor)
# The name of this function is following the framework integration guide of
# TensorComprehensions.
cpdef _ndarray_base fromDlpack(object dltensor) except +:
    """Zero-copy conversion from a DLPack tensor to a :class:`~cupy.ndarray`.

    DLPack is a open in memory tensor structure proposed in this repository:
    `dmlc/dlpack <https://github.com/dmlc/dlpack>`_.

    This function takes a :class:`PyCapsule` object which contains a pointer to
    a DLPack tensor as input, and returns a :class:`~cupy.ndarray`. This
    function does not copy the data in the DLPack tensor but both
    DLPack tensor and :class:`~cupy.ndarray` have pointers which are pointing
    to the same memory region for the data.

    Args:
        dltensor (:class:`PyCapsule`): Input DLPack tensor which is
            encapsulated in a :class:`PyCapsule` object.

    Returns:
        array (:class:`~cupy.ndarray`): A CuPy ndarray.

    .. warning::

        This function is deprecated in favor of :func:`~cupy.from_dlpack` and
        will be removed in a future version of CuPy.

    .. warning::

        As of the DLPack v0.5 specification, it is implicitly assumed that
        the user is responsible to ensure the Producer and the Consumer are
        operating on the same stream.

    .. seealso::

        :meth:`cupy.ndarray.toDlpack` is a method for zero-copy conversion
        from a :class:`~cupy.ndarray` to a DLPack tensor (which is encapsulated
        in a :class:`PyCapsule` object).

    .. admonition:: Example

        >>> import cupy
        >>> array1 = cupy.array([0, 1, 2], dtype=cupy.float32)
        >>> dltensor = array1.toDlpack()
        >>> array2 = cupy.fromDlpack(dltensor)
        >>> cupy.testing.assert_array_equal(array1, array2)

    """
    warnings.warn('This function is deprecated in favor of cupy.from_dlpack',
                  DeprecationWarning)
    return _dlpack_to_cupy_array(dltensor)
cdef inline _ndarray_base _dlpack_to_cupy_array(dltensor) except +:
    # Wrap the DLPack tensor held by the ``dltensor`` capsule as a
    # cupy.ndarray without copying the underlying data.
    cdef DLPackMemory mem = DLPackMemory(dltensor)
    cdef DLDataType dtype = mem.dlm_tensor.dl_tensor.dtype
    cdef int bits = dtype.bits
    if dtype.lanes != 1:
        raise ValueError(f'vector dtypes (lanes={dtype.lanes}) is '
                         'not supported')

    # Translate the (type-code, bit-width) pair into a CuPy dtype.
    if dtype.code == kDLUInt:
        cp_dtype = {8: cupy.uint8, 16: cupy.uint16,
                    32: cupy.uint32, 64: cupy.uint64}.get(bits)
        if cp_dtype is None:
            raise TypeError('uint{} is not supported.'.format(bits))
    elif dtype.code == kDLInt:
        cp_dtype = {8: cupy.int8, 16: cupy.int16,
                    32: cupy.int32, 64: cupy.int64}.get(bits)
        if cp_dtype is None:
            raise TypeError('int{} is not supported.'.format(bits))
    elif dtype.code == kDLFloat:
        cp_dtype = {16: cupy.float16, 32: cupy.float32,
                    64: cupy.float64}.get(bits)
        if cp_dtype is None:
            raise TypeError('float{} is not supported.'.format(bits))
    elif dtype.code == kDLComplex:
        # TODO(leofang): support complex32
        cp_dtype = {64: cupy.complex64, 128: cupy.complex128}.get(bits)
        if cp_dtype is None:
            raise TypeError('complex{} is not supported.'.format(bits))
    elif dtype.code == kDLBool:
        if bits != 8:
            raise TypeError(f'{bits}-bit bool is not supported')
        cp_dtype = cupy.bool_
    elif dtype.code == kDLBfloat:
        raise NotImplementedError('CuPy does not support bfloat16 yet')
    else:
        raise TypeError('Unsupported dtype. dtype code: {}'.format(dtype.code))

    mem_ptr = memory.MemoryPointer(mem, mem.dlm_tensor.dl_tensor.byte_offset)
    cdef int64_t ndim = mem.dlm_tensor.dl_tensor.ndim
    cdef int64_t* shape = mem.dlm_tensor.dl_tensor.shape
    cdef vector[Py_ssize_t] shape_vec
    shape_vec.assign(shape, shape + ndim)

    cdef int64_t* strides = mem.dlm_tensor.dl_tensor.strides
    cdef vector[Py_ssize_t] strides_vec
    if strides is NULL:
        # NULL strides means a compact row-major layout; let CuPy derive
        # the strides itself.
        # Make sure this capsule will never be used again.
        cpython.PyCapsule_SetName(mem.dltensor, 'used_dltensor')
        return core.ndarray(shape_vec, cp_dtype, mem_ptr, strides=None)

    # DLPack strides are expressed in elements; CuPy expects bytes.
    for i in range(ndim):
        strides_vec.push_back(strides[i] * (bits // 8))
    # Make sure this capsule will never be used again.
    cpython.PyCapsule_SetName(mem.dltensor, 'used_dltensor')
    return core.ndarray(shape_vec, cp_dtype, mem_ptr, strides=strides_vec)
cpdef from_dlpack(array):
    """Zero-copy conversion between array objects compliant with the DLPack
    data exchange protocol.

    Args:
        array (object): an array object that implements two methods:
            ``__dlpack__()`` and ``__dlpack_device__()``.

    Returns:
        cupy.ndarray: a CuPy array that can be safely accessed on CuPy's
        current stream.

    .. note::
        This function is different from CuPy's legacy :func:`~cupy.fromDlpack`
        function. This function takes any object implementing the DLPack data
        exchange protocol, as well as a raw :class:`PyCapsule` object that
        contains the DLPack tensor as input (for backward compatibility),
        whereas :func:`~cupy.fromDlpack` only accepts :class:`PyCapsule`
        objects. If the input object is not compliant with the protocol, users
        are responsible to ensure data safety.

    .. seealso::
        :func:`numpy.from_dlpack`,
        `Python Specification for DLPack`_,
        `Data interchange mechanisms`_

    .. _Python Specification for DLPack:
        https://dmlc.github.io/dlpack/latest/python_spec.html
    .. _Data interchange mechanisms:
        https://data-apis.org/array-api/latest/design_topics/data_interchange.html
    """
    if not hasattr(array, '__dlpack_device__'):
        # backward compatibility: accept passing in a pycapsule
        return _dlpack_to_cupy_array(array)

    dev_type, dev_id = array.__dlpack_device__()
    # CuPy is the consumer, so we provide our current stream to the producer
    # so that it can order its pending work before we touch the data.
    if dev_type == <int>kDLCUDA or dev_type == <int>kDLCUDAManaged:
        prev_device = cupy.cuda.runtime.getDevice()
        try:
            cupy.cuda.runtime.setDevice(dev_id)
            # this branch is CUDA-only; HIP devices report kDLROCM
            assert not runtime._is_hip_environment
            stream = stream_module.get_current_stream_ptr()
            if stream == 0:
                # a raw 0 is ambiguous in the DLPack protocol; pass the
                # explicit default-stream pointer instead
                stream = stream_module.get_default_stream_ptr()
            dltensor = array.__dlpack__(stream=stream)
        finally:
            # always restore the caller's active device
            cupy.cuda.runtime.setDevice(prev_device)
    elif dev_type == <int>kDLROCM:
        prev_device = cupy.cuda.runtime.getDevice()
        try:
            cupy.cuda.runtime.setDevice(dev_id)
            assert runtime._is_hip_environment
            stream = stream_module.get_current_stream_ptr()
            dltensor = array.__dlpack__(stream=stream)
        finally:
            cupy.cuda.runtime.setDevice(prev_device)
    elif dev_type == <int>kDLCPU:
        raise TypeError(
            'CPU arrays cannot be directly imported to CuPy. '
            'Use `cupy.array(numpy.from_dlpack(input))` instead.')
    else:
        # TODO(leofang): support kDLCUDAPinned etc
        raise TypeError(f'Unsupported array type: {dev_type}')
    return _dlpack_to_cupy_array(dltensor)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment