Commit deb763b7 authored by root's avatar root
Browse files

clone code from github

parent 93bf084b
Pipeline #3386 canceled with stages
@inproceedings{cupy_learningsys2017,
author = "Okuta, Ryosuke and Unno, Yuya and Nishino, Daisuke and Hido, Shohei and Loomis, Crissman",
title = "CuPy: A NumPy-Compatible Library for NVIDIA GPU Calculations",
booktitle = "Proceedings of Workshop on Machine Learning Systems (LearningSys) in The Thirty-first Annual Conference on Neural Information Processing Systems (NIPS)",
year = "2017",
url = "http://learningsys.org/nips17/assets/papers/paper_16.pdf"
}
# CuPy Code of Conduct
CuPy follows the [NumFOCUS Code of Conduct][homepage] available at https://numfocus.org/code-of-conduct.
Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at `dlfw@preferred.jp`.
[homepage]: https://numfocus.org/
Copyright (c) 2015 Preferred Infrastructure, Inc.
Copyright (c) 2015 Preferred Networks, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
# Contents of sdist. See also `setup.py`.
recursive-include cupy *.h *.hpp
recursive-include cupy *.pyx *.pxd *.pxi
recursive-include cupy_backends *.h *.hpp
recursive-include cupy_backends *.pyx *.pxd *.pxi
# Fail-safe to avoid including Cythonized sources in sdist.
recursive-exclude cupy *.cpp
recursive-exclude cupy_backends *.cpp
# Installers
recursive-include install *.py
recursive-include tests *.py
# Licenses
include LICENSE
include docs/LICENSE_THIRD_PARTY
include docs/source/license.rst
comment: false
github_checks:
  annotations: false

coverage:
  status:
    # Disable coverage measurement for overall codebase.
    project: off
    # Enable coverage measurement for diff introduced in the pull-request,
    # but do not mark "X" on commit status for now.
    patch:
      default:
        target: '0%'
# NumPy-compatible namespace bootstrap for ``cupy``.
#
# Helper modules are imported under leading-underscore aliases so that they
# do not show up as public ``cupy.*`` attributes.
import functools as _functools
import sys as _sys
import numpy as _numpy
from cupy import _environment
from cupy import _version

# Environment sanity checks and native-library preloading; these must run
# before the compiled core (``cupy._core``) is imported below.
_environment._detect_duplicate_installation() # NOQA
_environment._setup_win32_dll_directory() # NOQA
_environment._preload_library('cutensor') # NOQA
_environment._preload_library('nccl') # NOQA

try:
    # Importing the compiled (Cython) core can fail when CUDA libraries are
    # missing or mismatched; wrap the failure with diagnostic guidance.
    from cupy import _core # NOQA
except ImportError as exc:
    raise ImportError(f'''
================================================================
{_environment._diagnose_import_error()}
Original error:
{type(exc).__name__}: {exc}
================================================================
''') from exc

from cupy import cuda # NOQA
# Do not make `cupy.cupyx` available because it is confusing.
import cupyx as _cupyx # NOQA
def is_available():
    """Return whether a CUDA device is usable from this process.

    .. seealso:: :func:`cupy.cuda.is_available`
    """
    available = cuda.is_available()
    return available
__version__ = _version.__version__
from cupy import fft # NOQA
from cupy import linalg # NOQA
from cupy import polynomial # NOQA
from cupy import random # NOQA
# `cupy.sparse` is deprecated in v8
from cupy import sparse # NOQA
from cupy import testing # NOQA # NOQA
# import class and function
from cupy._core import ndarray # NOQA
from cupy._core import ufunc # NOQA
# =============================================================================
# Constants (borrowed from NumPy)
# =============================================================================
from numpy import e # NOQA
from numpy import euler_gamma # NOQA
from numpy import inf # NOQA
from numpy import nan # NOQA
from numpy import newaxis # == None # NOQA
from numpy import pi # NOQA
# APIs to be removed in NumPy 2.0.
# Remove these when bumping the baseline API to NumPy 2.0.
# https://github.com/cupy/cupy/pull/7800
PINF = Inf = Infinity = infty = inf # NOQA
NINF = -inf # NOQA
NAN = NaN = nan # NOQA
PZERO = 0.0 # NOQA
NZERO = -0.0 # NOQA
# =============================================================================
# Data types (borrowed from NumPy)
#
# The order of these declarations are borrowed from the NumPy document:
# https://numpy.org/doc/stable/reference/arrays.scalars.html
# =============================================================================
# -----------------------------------------------------------------------------
# Generic types
# -----------------------------------------------------------------------------
from numpy import complexfloating # NOQA
from numpy import floating # NOQA
from numpy import generic # NOQA
from numpy import inexact # NOQA
from numpy import integer # NOQA
from numpy import number # NOQA
from numpy import signedinteger # NOQA
from numpy import unsignedinteger # NOQA
# Not supported by CuPy:
# from numpy import flexible
# from numpy import character
# -----------------------------------------------------------------------------
# Booleans
# -----------------------------------------------------------------------------
from numpy import bool_ # NOQA
# -----------------------------------------------------------------------------
# Integers
# -----------------------------------------------------------------------------
from numpy import byte # NOQA
from numpy import short # NOQA
from numpy import intc # NOQA
from numpy import int_ # NOQA
from numpy import longlong # NOQA
from numpy import intp # NOQA
from numpy import int8 # NOQA
from numpy import int16 # NOQA
from numpy import int32 # NOQA
from numpy import int64 # NOQA
# -----------------------------------------------------------------------------
# Unsigned integers
# -----------------------------------------------------------------------------
from numpy import ubyte # NOQA
from numpy import ushort # NOQA
from numpy import uintc # NOQA
from numpy import uint # NOQA
from numpy import ulonglong # NOQA
from numpy import uintp # NOQA
from numpy import uint8 # NOQA
from numpy import uint16 # NOQA
from numpy import uint32 # NOQA
from numpy import uint64 # NOQA
# -----------------------------------------------------------------------------
# Floating-point numbers
# -----------------------------------------------------------------------------
from numpy import half # NOQA
from numpy import single # NOQA
from numpy import double # NOQA
from numpy import float_ # NOQA
from numpy import longfloat # NOQA
from numpy import float16 # NOQA
from numpy import float32 # NOQA
from numpy import float64 # NOQA
# Not supported by CuPy:
# from numpy import float96
# from numpy import float128
# -----------------------------------------------------------------------------
# Complex floating-point numbers
# -----------------------------------------------------------------------------
from numpy import csingle # NOQA
from numpy import singlecomplex # NOQA
from numpy import cdouble # NOQA
from numpy import cfloat # NOQA
from numpy import complex_ # NOQA
from numpy import complex64 # NOQA
from numpy import complex128 # NOQA
# Not supported by CuPy:
# from numpy import complex192
# from numpy import complex256
# from numpy import clongfloat
# -----------------------------------------------------------------------------
# Any Python object
# -----------------------------------------------------------------------------
# Not supported by CuPy:
# from numpy import object_
# from numpy import bytes_
# from numpy import unicode_
# from numpy import void
# -----------------------------------------------------------------------------
# Built-in Python types
# -----------------------------------------------------------------------------
# =============================================================================
# Routines
#
# The order of these declarations are borrowed from the NumPy document:
# https://numpy.org/doc/stable/reference/routines.html
# =============================================================================
# -----------------------------------------------------------------------------
# Array creation routines
# -----------------------------------------------------------------------------
from cupy._creation.basic import empty # NOQA
from cupy._creation.basic import empty_like # NOQA
from cupy._creation.basic import eye # NOQA
from cupy._creation.basic import full # NOQA
from cupy._creation.basic import full_like # NOQA
from cupy._creation.basic import identity # NOQA
from cupy._creation.basic import ones # NOQA
from cupy._creation.basic import ones_like # NOQA
from cupy._creation.basic import zeros # NOQA
from cupy._creation.basic import zeros_like # NOQA
from cupy._creation.from_data import copy # NOQA
from cupy._creation.from_data import array # NOQA
from cupy._creation.from_data import asanyarray # NOQA
from cupy._creation.from_data import asarray # NOQA
from cupy._creation.from_data import ascontiguousarray # NOQA
from cupy._creation.from_data import fromfile # NOQA
from cupy._creation.from_data import fromfunction # NOQA
from cupy._creation.from_data import fromiter # NOQA
from cupy._creation.from_data import frombuffer # NOQA
from cupy._creation.from_data import fromstring # NOQA
from cupy._creation.from_data import loadtxt # NOQA
from cupy._creation.from_data import genfromtxt # NOQA
from cupy._creation.ranges import arange # NOQA
from cupy._creation.ranges import linspace # NOQA
from cupy._creation.ranges import logspace # NOQA
from cupy._creation.ranges import meshgrid # NOQA
from cupy._creation.ranges import mgrid # NOQA
from cupy._creation.ranges import ogrid # NOQA
from cupy._creation.matrix import diag # NOQA
from cupy._creation.matrix import diagflat # NOQA
from cupy._creation.matrix import tri # NOQA
from cupy._creation.matrix import tril # NOQA
from cupy._creation.matrix import triu # NOQA
from cupy._creation.matrix import vander # NOQA
# -----------------------------------------------------------------------------
# Functional routines
# -----------------------------------------------------------------------------
from cupy._functional.piecewise import piecewise # NOQA
from cupy._functional.vectorize import vectorize # NOQA
from cupy.lib._shape_base import apply_along_axis # NOQA
# -----------------------------------------------------------------------------
# Array manipulation routines
# -----------------------------------------------------------------------------
from cupy._manipulation.basic import copyto # NOQA
from cupy._manipulation.shape import shape # NOQA
from cupy._manipulation.shape import ravel # NOQA
from cupy._manipulation.shape import reshape # NOQA
from cupy._manipulation.transpose import moveaxis # NOQA
from cupy._manipulation.transpose import rollaxis # NOQA
from cupy._manipulation.transpose import swapaxes # NOQA
from cupy._manipulation.transpose import transpose # NOQA
from cupy._manipulation.dims import atleast_1d # NOQA
from cupy._manipulation.dims import atleast_2d # NOQA
from cupy._manipulation.dims import atleast_3d # NOQA
from cupy._manipulation.dims import broadcast # NOQA
from cupy._manipulation.dims import broadcast_arrays # NOQA
from cupy._manipulation.dims import broadcast_to # NOQA
from cupy._manipulation.dims import expand_dims # NOQA
from cupy._manipulation.dims import squeeze # NOQA
from cupy._manipulation.join import column_stack # NOQA
from cupy._manipulation.join import concatenate # NOQA
from cupy._manipulation.join import dstack # NOQA
from cupy._manipulation.join import hstack # NOQA
from cupy._manipulation.join import stack # NOQA
from cupy._manipulation.join import vstack # NOQA
from cupy._manipulation.join import vstack as row_stack # NOQA
from cupy._manipulation.kind import asarray_chkfinite # NOQA
from cupy._manipulation.kind import asfarray # NOQA
from cupy._manipulation.kind import asfortranarray # NOQA
from cupy._manipulation.kind import require # NOQA
from cupy._manipulation.split import array_split # NOQA
from cupy._manipulation.split import dsplit # NOQA
from cupy._manipulation.split import hsplit # NOQA
from cupy._manipulation.split import split # NOQA
from cupy._manipulation.split import vsplit # NOQA
from cupy._manipulation.tiling import repeat # NOQA
from cupy._manipulation.tiling import tile # NOQA
from cupy._manipulation.add_remove import append # NOQA
from cupy._manipulation.add_remove import resize # NOQA
from cupy._manipulation.add_remove import unique # NOQA
from cupy._manipulation.add_remove import trim_zeros # NOQA
from cupy._manipulation.rearrange import flip # NOQA
from cupy._manipulation.rearrange import fliplr # NOQA
from cupy._manipulation.rearrange import flipud # NOQA
from cupy._manipulation.rearrange import roll # NOQA
from cupy._manipulation.rearrange import rot90 # NOQA
# Borrowed from NumPy
if hasattr(_numpy, 'broadcast_shapes'): # NumPy 1.20
from numpy import broadcast_shapes # NOQA
# -----------------------------------------------------------------------------
# Binary operations
# -----------------------------------------------------------------------------
from cupy._binary.elementwise import bitwise_and # NOQA
from cupy._binary.elementwise import bitwise_or # NOQA
from cupy._binary.elementwise import bitwise_xor # NOQA
from cupy._binary.elementwise import bitwise_not # NOQA
from cupy._binary.elementwise import invert # NOQA
from cupy._binary.elementwise import left_shift # NOQA
from cupy._binary.elementwise import right_shift # NOQA
from cupy._binary.packing import packbits # NOQA
from cupy._binary.packing import unpackbits # NOQA
def binary_repr(num, width=None):
    """Return the binary representation of the input number as a string.

    This is a host-side helper; the conversion is delegated to NumPy since
    the result is a plain Python string.

    .. seealso:: :func:`numpy.binary_repr`
    """
    return _numpy.binary_repr(num, width=width)
# -----------------------------------------------------------------------------
# Data type routines (mostly borrowed from NumPy)
# -----------------------------------------------------------------------------
def can_cast(from_, to, casting='safe'):
    """Return True if a cast between data types can occur according to
    the casting rule.

    If ``from_`` is a scalar or array scalar, also returns True if the
    scalar value can be cast without overflow or truncation to an integer.

    .. seealso:: :func:`numpy.can_cast`
    """
    if isinstance(from_, ndarray):
        from_ = from_.dtype
    return _numpy.can_cast(from_, to, casting=casting)
def common_type(*arrays):
    """Return a scalar type which is common to the input arrays.

    Boolean arrays are rejected; integer arrays are promoted to float64
    before the common type is computed.

    .. seealso:: :func:`numpy.common_type`
    """
    # With no inputs, mirror NumPy's behavior of returning float16.
    if not arrays:
        return _numpy.float16
    promoted_float = _numpy.dtype('float64')
    dtypes = []
    for arr in arrays:
        kind = arr.dtype.kind
        if kind == 'b':
            raise TypeError('can\'t get common type for non-numeric array')
        # Integer (signed/unsigned) inputs participate as float64.
        dtypes.append(promoted_float if kind in 'iu' else arr.dtype)
    return _functools.reduce(_numpy.promote_types, dtypes).type
def result_type(*arrays_and_dtypes):
    """Return the type resulting from applying the NumPy type promotion
    rules to the arguments.

    .. seealso:: :func:`numpy.result_type`
    """
    dtypes = []
    for item in arrays_and_dtypes:
        # Arrays contribute their dtype; everything else is passed through.
        dtypes.append(item.dtype if isinstance(item, ndarray) else item)
    return _numpy.result_type(*dtypes)
from cupy._core.core import min_scalar_type # NOQA
from numpy import obj2sctype # NOQA
from numpy import promote_types # NOQA
from numpy import dtype # NOQA
from numpy import format_parser # NOQA
from numpy import finfo # NOQA
from numpy import iinfo # NOQA
from numpy import find_common_type # NOQA
from numpy import issctype # NOQA
from numpy import issubclass_ # NOQA
from numpy import issubdtype # NOQA
from numpy import issubsctype # NOQA
from numpy import mintypecode # NOQA
from numpy import sctype2char # NOQA
from numpy import typename # NOQA
# -----------------------------------------------------------------------------
# Optionally Scipy-accelerated routines
# -----------------------------------------------------------------------------
# TODO(beam2d): Implement it
# -----------------------------------------------------------------------------
# Discrete Fourier Transform
# -----------------------------------------------------------------------------
# TODO(beam2d): Implement it
# -----------------------------------------------------------------------------
# Indexing routines
# -----------------------------------------------------------------------------
from cupy._indexing.generate import c_ # NOQA
from cupy._indexing.generate import indices # NOQA
from cupy._indexing.generate import ix_ # NOQA
from cupy._indexing.generate import mask_indices # NOQA
from cupy._indexing.generate import tril_indices # NOQA
from cupy._indexing.generate import tril_indices_from # NOQA
from cupy._indexing.generate import triu_indices # NOQA
from cupy._indexing.generate import triu_indices_from # NOQA
from cupy._indexing.generate import r_ # NOQA
from cupy._indexing.generate import ravel_multi_index # NOQA
from cupy._indexing.generate import unravel_index # NOQA
from cupy._indexing.indexing import choose # NOQA
from cupy._indexing.indexing import compress # NOQA
from cupy._indexing.indexing import diagonal # NOQA
from cupy._indexing.indexing import extract # NOQA
from cupy._indexing.indexing import select # NOQA
from cupy._indexing.indexing import take # NOQA
from cupy._indexing.indexing import take_along_axis # NOQA
from cupy._indexing.insert import place # NOQA
from cupy._indexing.insert import put # NOQA
from cupy._indexing.insert import putmask # NOQA
from cupy._indexing.insert import fill_diagonal # NOQA
from cupy._indexing.insert import diag_indices # NOQA
from cupy._indexing.insert import diag_indices_from # NOQA
from cupy._indexing.iterate import flatiter # NOQA
# Borrowed from NumPy
from numpy import get_array_wrap # NOQA
from numpy import index_exp # NOQA
from numpy import ndindex # NOQA
from numpy import s_ # NOQA
# -----------------------------------------------------------------------------
# Input and output
# -----------------------------------------------------------------------------
from cupy._io.npz import load # NOQA
from cupy._io.npz import save # NOQA
from cupy._io.npz import savez # NOQA
from cupy._io.npz import savez_compressed # NOQA
from cupy._io.formatting import array_repr # NOQA
from cupy._io.formatting import array_str # NOQA
from cupy._io.formatting import array2string # NOQA
from cupy._io.formatting import format_float_positional # NOQA
from cupy._io.formatting import format_float_scientific # NOQA
from cupy._io.text import savetxt # NOQA
def base_repr(number, base=2, padding=0):  # NOQA (needed to avoid redefinition of `number`)
    """Return a string representation of a number in the given base system.

    .. seealso:: :func:`numpy.base_repr`
    """
    digits = _numpy.base_repr(number, base, padding)
    return digits
# Borrowed from NumPy
from numpy import DataSource # NOQA
from numpy import get_printoptions # NOQA
from numpy import set_printoptions # NOQA
from numpy import printoptions # NOQA
from numpy import set_string_function # NOQA
# -----------------------------------------------------------------------------
# Linear algebra
# -----------------------------------------------------------------------------
from cupy.linalg._einsum import einsum # NOQA
from cupy.linalg._product import cross # NOQA
from cupy.linalg._product import dot # NOQA
from cupy.linalg._product import inner # NOQA
from cupy.linalg._product import kron # NOQA
from cupy.linalg._product import matmul # NOQA
from cupy.linalg._product import outer # NOQA
from cupy.linalg._product import tensordot # NOQA
from cupy.linalg._product import vdot # NOQA
from cupy.linalg._norms import trace # NOQA
# -----------------------------------------------------------------------------
# Logic functions
# -----------------------------------------------------------------------------
from cupy._logic.comparison import allclose # NOQA
from cupy._logic.comparison import array_equal # NOQA
from cupy._logic.comparison import array_equiv # NOQA
from cupy._logic.comparison import isclose # NOQA
from cupy._logic.content import isfinite # NOQA
from cupy._logic.content import isinf # NOQA
from cupy._logic.content import isnan # NOQA
from cupy._logic.content import isneginf # NOQA
from cupy._logic.content import isposinf # NOQA
from cupy._logic.truth import in1d # NOQA
from cupy._logic.truth import isin # NOQA
from cupy._logic.type_testing import iscomplex # NOQA
from cupy._logic.type_testing import iscomplexobj # NOQA
from cupy._logic.type_testing import isfortran # NOQA
from cupy._logic.type_testing import isreal # NOQA
from cupy._logic.type_testing import isrealobj # NOQA
from cupy._logic.truth import in1d # NOQA
from cupy._logic.truth import intersect1d # NOQA
from cupy._logic.truth import isin # NOQA
from cupy._logic.truth import setdiff1d # NOQA
from cupy._logic.truth import setxor1d # NOQA
from cupy._logic.truth import union1d # NOQA
def isscalar(element):
    """Return True if the type of ``element`` is a scalar type.

    Delegates to NumPy, since scalar-ness is a host-side property.

    .. seealso:: :func:`numpy.isscalar`
    """
    result = _numpy.isscalar(element)
    return result
from cupy._logic.ops import logical_and # NOQA
from cupy._logic.ops import logical_not # NOQA
from cupy._logic.ops import logical_or # NOQA
from cupy._logic.ops import logical_xor # NOQA
from cupy._logic.comparison import equal # NOQA
from cupy._logic.comparison import greater # NOQA
from cupy._logic.comparison import greater_equal # NOQA
from cupy._logic.comparison import less # NOQA
from cupy._logic.comparison import less_equal # NOQA
from cupy._logic.comparison import not_equal # NOQA
from cupy._logic.truth import all # NOQA
from cupy._logic.truth import all as alltrue # NOQA
from cupy._logic.truth import any # NOQA
from cupy._logic.truth import any as sometrue # NOQA
# ------------------------------------------------------------------------------
# Polynomial functions
# ------------------------------------------------------------------------------
from cupy.lib._polynomial import poly1d # NOQA
from cupy.lib._routines_poly import poly # NOQA
from cupy.lib._routines_poly import polyadd # NOQA
from cupy.lib._routines_poly import polysub # NOQA
from cupy.lib._routines_poly import polymul # NOQA
from cupy.lib._routines_poly import polyfit # NOQA
from cupy.lib._routines_poly import polyval # NOQA
from cupy.lib._routines_poly import roots # NOQA
# Borrowed from NumPy
from numpy import RankWarning # NOQA
# -----------------------------------------------------------------------------
# Mathematical functions
# -----------------------------------------------------------------------------
from cupy._math.trigonometric import arccos # NOQA
from cupy._math.trigonometric import arcsin # NOQA
from cupy._math.trigonometric import arctan # NOQA
from cupy._math.trigonometric import arctan2 # NOQA
from cupy._math.trigonometric import cos # NOQA
from cupy._math.trigonometric import deg2rad # NOQA
from cupy._math.trigonometric import degrees # NOQA
from cupy._math.trigonometric import hypot # NOQA
from cupy._math.trigonometric import rad2deg # NOQA
from cupy._math.trigonometric import radians # NOQA
from cupy._math.trigonometric import sin # NOQA
from cupy._math.trigonometric import tan # NOQA
from cupy._math.trigonometric import unwrap # NOQA
from cupy._math.hyperbolic import arccosh # NOQA
from cupy._math.hyperbolic import arcsinh # NOQA
from cupy._math.hyperbolic import arctanh # NOQA
from cupy._math.hyperbolic import cosh # NOQA
from cupy._math.hyperbolic import sinh # NOQA
from cupy._math.hyperbolic import tanh # NOQA
from cupy._math.rounding import around # NOQA
from cupy._math.rounding import ceil # NOQA
from cupy._math.rounding import fix # NOQA
from cupy._math.rounding import floor # NOQA
from cupy._math.rounding import rint # NOQA
from cupy._math.rounding import round_ # NOQA
from cupy._math.rounding import round_ as round # NOQA
from cupy._math.rounding import trunc # NOQA
from cupy._math.sumprod import prod # NOQA
from cupy._math.sumprod import prod as product # NOQA
from cupy._math.sumprod import sum # NOQA
from cupy._math.sumprod import cumprod # NOQA
from cupy._math.sumprod import cumprod as cumproduct # NOQA
from cupy._math.sumprod import cumsum # NOQA
from cupy._math.sumprod import ediff1d # NOQA
from cupy._math.sumprod import nancumprod # NOQA
from cupy._math.sumprod import nancumsum # NOQA
from cupy._math.sumprod import nansum # NOQA
from cupy._math.sumprod import nanprod # NOQA
from cupy._math.sumprod import diff # NOQA
from cupy._math.sumprod import gradient # NOQA
from cupy._math.sumprod import trapz # NOQA
from cupy._math.window import bartlett # NOQA
from cupy._math.window import blackman # NOQA
from cupy._math.window import hamming # NOQA
from cupy._math.window import hanning # NOQA
from cupy._math.window import kaiser # NOQA
from cupy._math.explog import exp # NOQA
from cupy._math.explog import exp2 # NOQA
from cupy._math.explog import expm1 # NOQA
from cupy._math.explog import log # NOQA
from cupy._math.explog import log10 # NOQA
from cupy._math.explog import log1p # NOQA
from cupy._math.explog import log2 # NOQA
from cupy._math.explog import logaddexp # NOQA
from cupy._math.explog import logaddexp2 # NOQA
from cupy._math.special import i0 # NOQA
from cupy._math.special import sinc # NOQA
from cupy._math.floating import copysign # NOQA
from cupy._math.floating import frexp # NOQA
from cupy._math.floating import ldexp # NOQA
from cupy._math.floating import nextafter # NOQA
from cupy._math.floating import signbit # NOQA
from cupy._math.rational import gcd # NOQA
from cupy._math.rational import lcm # NOQA
from cupy._math.arithmetic import add # NOQA
from cupy._math.arithmetic import divide # NOQA
from cupy._math.arithmetic import divmod # NOQA
from cupy._math.arithmetic import floor_divide # NOQA
from cupy._math.arithmetic import float_power # NOQA
from cupy._math.arithmetic import fmod # NOQA
from cupy._math.arithmetic import modf # NOQA
from cupy._math.arithmetic import multiply # NOQA
from cupy._math.arithmetic import negative # NOQA
from cupy._math.arithmetic import positive # NOQA
from cupy._math.arithmetic import power # NOQA
from cupy._math.arithmetic import reciprocal # NOQA
from cupy._math.arithmetic import remainder # NOQA
from cupy._math.arithmetic import remainder as mod # NOQA
from cupy._math.arithmetic import subtract # NOQA
from cupy._math.arithmetic import true_divide # NOQA
from cupy._math.arithmetic import angle # NOQA
from cupy._math.arithmetic import conjugate as conj # NOQA
from cupy._math.arithmetic import conjugate # NOQA
from cupy._math.arithmetic import imag # NOQA
from cupy._math.arithmetic import real # NOQA
from cupy._math.misc import absolute as abs # NOQA
from cupy._math.misc import absolute # NOQA
from cupy._math.misc import cbrt # NOQA
from cupy._math.misc import clip # NOQA
from cupy._math.misc import fabs # NOQA
from cupy._math.misc import fmax # NOQA
from cupy._math.misc import fmin # NOQA
from cupy._math.misc import interp # NOQA
from cupy._math.misc import maximum # NOQA
from cupy._math.misc import minimum # NOQA
from cupy._math.misc import nan_to_num # NOQA
from cupy._math.misc import real_if_close # NOQA
from cupy._math.misc import sign # NOQA
from cupy._math.misc import heaviside # NOQA
from cupy._math.misc import sqrt # NOQA
from cupy._math.misc import square # NOQA
from cupy._math.misc import convolve # NOQA
# -----------------------------------------------------------------------------
# Miscellaneous routines
# -----------------------------------------------------------------------------
from cupy._misc.byte_bounds import byte_bounds # NOQA
from cupy._misc.memory_ranges import may_share_memory # NOQA
from cupy._misc.memory_ranges import shares_memory # NOQA
from cupy._misc.who import who # NOQA
# Borrowed from NumPy
from numpy import disp # NOQA
from numpy import iterable # NOQA
from numpy import safe_eval # NOQA
from numpy import AxisError # NOQA
# -----------------------------------------------------------------------------
# Padding
# -----------------------------------------------------------------------------
from cupy._padding.pad import pad # NOQA
# -----------------------------------------------------------------------------
# Sorting, searching, and counting
# -----------------------------------------------------------------------------
from cupy._sorting.count import count_nonzero # NOQA
from cupy._sorting.search import argmax # NOQA
from cupy._sorting.search import argmin # NOQA
from cupy._sorting.search import argwhere # NOQA
from cupy._sorting.search import flatnonzero # NOQA
from cupy._sorting.search import nanargmax # NOQA
from cupy._sorting.search import nanargmin # NOQA
from cupy._sorting.search import nonzero # NOQA
from cupy._sorting.search import searchsorted # NOQA
from cupy._sorting.search import where # NOQA
from cupy._sorting.sort import argpartition # NOQA
from cupy._sorting.sort import argsort # NOQA
from cupy._sorting.sort import lexsort # NOQA
from cupy._sorting.sort import msort # NOQA
from cupy._sorting.sort import sort_complex # NOQA
from cupy._sorting.sort import partition # NOQA
from cupy._sorting.sort import sort # NOQA
# -----------------------------------------------------------------------------
# Statistics
# -----------------------------------------------------------------------------
from cupy._statistics.correlation import corrcoef # NOQA
from cupy._statistics.correlation import cov # NOQA
from cupy._statistics.correlation import correlate # NOQA
from cupy._statistics.order import amax # NOQA
from cupy._statistics.order import amax as max # NOQA
from cupy._statistics.order import amin # NOQA
from cupy._statistics.order import amin as min # NOQA
from cupy._statistics.order import nanmax # NOQA
from cupy._statistics.order import nanmin # NOQA
from cupy._statistics.order import percentile # NOQA
from cupy._statistics.order import ptp # NOQA
from cupy._statistics.order import quantile # NOQA
from cupy._statistics.meanvar import median # NOQA
from cupy._statistics.meanvar import average # NOQA
from cupy._statistics.meanvar import mean # NOQA
from cupy._statistics.meanvar import std # NOQA
from cupy._statistics.meanvar import var # NOQA
from cupy._statistics.meanvar import nanmedian # NOQA
from cupy._statistics.meanvar import nanmean # NOQA
from cupy._statistics.meanvar import nanstd # NOQA
from cupy._statistics.meanvar import nanvar # NOQA
from cupy._statistics.histogram import bincount # NOQA
from cupy._statistics.histogram import digitize # NOQA
from cupy._statistics.histogram import histogram # NOQA
from cupy._statistics.histogram import histogram2d # NOQA
from cupy._statistics.histogram import histogramdd # NOQA
# -----------------------------------------------------------------------------
# Classes without their own docs
# -----------------------------------------------------------------------------
from numpy import ComplexWarning # NOQA
from numpy import ModuleDeprecationWarning # NOQA
from numpy import TooHardError # NOQA
from numpy import VisibleDeprecationWarning # NOQA
# -----------------------------------------------------------------------------
# Undocumented functions
# -----------------------------------------------------------------------------
from cupy._core import size # NOQA
def ndim(a):
    """Returns the number of dimensions of an array.

    Args:
        a (array-like): If it is not already an `cupy.ndarray`, a conversion
            via :func:`numpy.asarray` is attempted.

    Returns:
        (int): The number of dimensions in `a`.

    """
    # Objects exposing ``.ndim`` (e.g. cupy/numpy arrays) answer directly;
    # anything else falls back to NumPy's conversion-based implementation.
    if hasattr(a, 'ndim'):
        return a.ndim
    return _numpy.ndim(a)
# -----------------------------------------------------------------------------
# CuPy specific functions
# -----------------------------------------------------------------------------
from cupy._util import clear_memo # NOQA
from cupy._util import memoize # NOQA
from cupy._core import ElementwiseKernel # NOQA
from cupy._core import RawKernel # NOQA
from cupy._core import RawModule # NOQA
from cupy._core._reduction import ReductionKernel # NOQA
# -----------------------------------------------------------------------------
# DLPack
# -----------------------------------------------------------------------------
from cupy._core import fromDlpack # NOQA
from cupy._core import from_dlpack # NOQA
def asnumpy(a, stream=None, order='C', out=None):
"""Returns an array on the host memory from an arbitrary source array.
Args:
a: Arbitrary object that can be converted to :class:`numpy.ndarray`.
stream (cupy.cuda.Stream): CUDA stream object. If it is specified, then
the device-to-host copy runs asynchronously. Otherwise, the copy is
synchronous. Note that if ``a`` is not a :class:`cupy.ndarray`
object, then this argument has no effect.
order ({'C', 'F', 'A'}): The desired memory layout of the host
array. When ``order`` is 'A', it uses 'F' if ``a`` is
fortran-contiguous and 'C' otherwise.
out (numpy.ndarray): The output array to be written to. It must have
compatible shape and dtype with those of ``a``'s.
Returns:
numpy.ndarray: Converted array on the host memory.
"""
if isinstance(a, ndarray):
return a.get(stream=stream, order=order, out=out)
elif hasattr(a, "__cuda_array_interface__"):
return array(a).get(stream=stream, order=order, out=out)
else:
temp = _numpy.asarray(a, order=order)
if out is not None:
out[...] = temp
else:
out = temp
return out
# Handle to this module itself; returned by get_array_module() below when a
# CuPy array is among the arguments.
_cupy = _sys.modules[__name__]
def get_array_module(*args):
    """Returns the array module for arguments.

    This function is used to implement CPU/GPU generic code. If at least one of
    the arguments is a :class:`cupy.ndarray` object, the :mod:`cupy` module is
    returned.

    Args:
        args: Values to determine whether NumPy or CuPy should be used.

    Returns:
        module: :mod:`cupy` or :mod:`numpy` is returned based on the types of
        the arguments.

    .. admonition:: Example

       A NumPy/CuPy generic function can be written as follows

       >>> def softplus(x):
       ...     xp = cupy.get_array_module(x)
       ...     return xp.maximum(0, x) + xp.log1p(xp.exp(-abs(x)))

    """
    # Types that indicate the data lives on the GPU (including sparse
    # matrices and fusion proxies).
    gpu_types = (ndarray, _cupyx.scipy.sparse.spmatrix,
                 _core.fusion._FusionVarArray,
                 _core.new_fusion._ArrayProxy)
    if any(isinstance(arg, gpu_types) for arg in args):
        return _cupy
    return _numpy
# Public alias for the kernel-fusion decorator.
fuse = _core.fusion.fuse

disable_experimental_feature_warning = False

# set default allocator: create module-level memory pools and install their
# malloc as the default (pinned) memory allocators at import time.
_default_memory_pool = cuda.MemoryPool()
_default_pinned_memory_pool = cuda.PinnedMemoryPool()
cuda.set_allocator(_default_memory_pool.malloc)
cuda.set_pinned_memory_allocator(_default_pinned_memory_pool.malloc)
def get_default_memory_pool():
    """Returns CuPy default memory pool for GPU memory.

    Returns:
        cupy.cuda.MemoryPool: The memory pool object.

    .. note::
       If you want to disable memory pool, please use the following code.

       >>> cupy.cuda.set_allocator(None)

    """
    # Returns the module-level pool created at import time (not a copy).
    return _default_memory_pool
def get_default_pinned_memory_pool():
    """Returns CuPy default memory pool for pinned memory.

    Returns:
        cupy.cuda.PinnedMemoryPool: The memory pool object.

    .. note::
       If you want to disable memory pool, please use the following code.

       >>> cupy.cuda.set_pinned_memory_allocator(None)

    """
    # Returns the module-level pinned pool created at import time.
    return _default_pinned_memory_pool
def show_config(*, _full=False):
    """Prints the current runtime configuration to standard output."""
    # Render the runtime info object and write it out, flushing so the text
    # appears immediately even when stdout is block-buffered.
    info = _cupyx.get_runtime_info(full=_full)
    _sys.stdout.write(str(info))
    _sys.stdout.flush()
# Attribute names that are not defined eagerly in this module; __getattr__
# below forwards them to NumPy on first access.
_deprecated_apis = [
    'int0',
    'uint0',
    'bool8',
]
def __getattr__(name):
    # Module-level fallback (PEP 562): forward the deprecated NumPy aliases
    # listed in _deprecated_apis; anything else is a genuine missing
    # attribute.
    if name not in _deprecated_apis:
        raise AttributeError(f"module 'cupy' has no attribute {name!r}")
    return getattr(_numpy, name)
# Functions from the following NumPy document
# https://numpy.org/doc/stable/reference/routines.bitwise.html
from cupy import _core
# Element-wise bit operations re-exported from cupy._core ufuncs.
bitwise_and = _core.bitwise_and
bitwise_or = _core.bitwise_or
bitwise_xor = _core.bitwise_xor
bitwise_not = _core.invert  # NumPy exposes invert under both names
invert = _core.invert
left_shift = _core.left_shift
right_shift = _core.right_shift
import cupy
from cupy import _core
_packbits_kernel = {
'big': _core.ElementwiseKernel(
'raw T a, raw int32 a_size', 'uint8 packed',
'''for (int j = 0; j < 8; ++j) {
int k = i * 8 + j;
int bit = k < a_size && a[k] != 0;
packed |= bit << (7 - j);
}''',
'cupy_packbits_big'
),
'little': _core.ElementwiseKernel(
'raw T a, raw int32 a_size', 'uint8 packed',
'''for (int j = 0; j < 8; ++j) {
int k = i * 8 + j;
int bit = k < a_size && a[k] != 0;
packed |= bit << j;
}''',
'cupy_packbits_little'
)
}
def packbits(a, axis=None, bitorder='big'):
    """Packs the elements of a binary-valued array into bits in a uint8 array.

    This function currently does not support ``axis`` option.

    Args:
        a (cupy.ndarray): Input array.
        axis (int, optional): Not supported yet.
        bitorder (str, optional): bit order to use when packing the array,
            allowed values are `'little'` and `'big'`. Defaults to `'big'`.

    Returns:
        cupy.ndarray: The packed array.

    .. note::
        When the input array is empty, this function returns a copy of it,
        i.e., the type of the output array is not necessarily always uint8.
        This exactly follows the NumPy's behaviour (as of version 1.11),
        although this is inconsistent to the documentation.

    .. seealso:: :func:`numpy.packbits`

    """
    # Validate dtype and options before touching the data.
    if a.dtype.kind not in 'biu':
        raise TypeError(
            'Expected an input array of integer or boolean data type')
    if axis is not None:
        raise NotImplementedError('axis option is not supported yet')
    if bitorder not in ('big', 'little'):
        raise ValueError("bitorder must be either 'big' or 'little'")

    flat = a.ravel()
    # One output byte per 8 input elements, rounded up.
    n_bytes = (flat.size + 7) // 8
    packed = cupy.zeros((n_bytes,), dtype=cupy.uint8)
    return _packbits_kernel[bitorder](flat, flat.size, packed)
# Elementwise kernels expanding each bit of a uint8 input into one output
# element, MSB-first ('big') or LSB-first ('little').
_unpackbits_kernel = {
    'big': _core.ElementwiseKernel(
        'raw uint8 a', 'T unpacked',
        'unpacked = (a[i / 8] >> (7 - i % 8)) & 1;',
        'cupy_unpackbits_big'
    ),
    'little': _core.ElementwiseKernel(
        'raw uint8 a', 'T unpacked',
        'unpacked = (a[i / 8] >> (i % 8)) & 1;',
        'cupy_unpackbits_little'
    )
}
def unpackbits(a, axis=None, bitorder='big'):
    """Unpacks elements of a uint8 array into a binary-valued output array.

    This function currently does not support ``axis`` option.

    Args:
        a (cupy.ndarray): Input array.
        axis (int, optional): Not supported yet.
        bitorder (str, optional): bit order to use when unpacking the array,
            allowed values are `'little'` and `'big'`. Defaults to `'big'`.

    Returns:
        cupy.ndarray: The unpacked array.

    .. seealso:: :func:`numpy.unpackbits`

    """
    # Validate dtype and options before allocating the output.
    if a.dtype != cupy.uint8:
        raise TypeError('Expected an input array of unsigned byte data type')
    if axis is not None:
        raise NotImplementedError('axis option is not supported yet')
    if bitorder not in ('big', 'little'):
        raise ValueError("bitorder must be either 'big' or 'little'")

    # Uninitialized allocation is fine: the kernel writes every element.
    out = cupy.ndarray((a.size * 8), dtype=cupy.uint8)
    return _unpackbits_kernel[bitorder](a, out)
# mypy: ignore-errors
from cupy._core import core # NOQA
from cupy._core import fusion # NOQA
from cupy._core import internal # NOQA
# internal APIs for testing and development
from cupy._core._accelerator import set_elementwise_accelerators # NOQA
from cupy._core._accelerator import set_reduction_accelerators # NOQA
from cupy._core._accelerator import set_routine_accelerators # NOQA
from cupy._core._accelerator import get_elementwise_accelerators # NOQA
from cupy._core._accelerator import get_reduction_accelerators # NOQA
from cupy._core._accelerator import get_routine_accelerators # NOQA
# import class and function
from cupy._core._kernel import create_ufunc # NOQA
from cupy._core._kernel import ElementwiseKernel # NOQA
from cupy._core._kernel import ufunc # NOQA
from cupy._core._kernel import _get_warpsize # NOQA
from cupy._core._reduction import create_reduction_func # NOQA
from cupy._core._reduction import ReductionKernel # NOQA
from cupy._core._routines_binary import bitwise_and # NOQA
from cupy._core._routines_binary import bitwise_or # NOQA
from cupy._core._routines_binary import bitwise_xor # NOQA
from cupy._core._routines_binary import invert # NOQA
from cupy._core._routines_binary import left_shift # NOQA
from cupy._core._routines_binary import right_shift # NOQA
from cupy._core._routines_linalg import _mat_ptrs # NOQA
from cupy._core._routines_linalg import dot # NOQA
from cupy._core._routines_linalg import get_compute_type # NOQA
from cupy._core._routines_linalg import matmul # NOQA
from cupy._core._routines_linalg import set_compute_type # NOQA
from cupy._core._routines_linalg import tensordot_core # NOQA
from cupy._core._routines_logic import create_comparison # NOQA
from cupy._core._routines_logic import equal # NOQA
from cupy._core._routines_logic import greater # NOQA
from cupy._core._routines_logic import greater_equal # NOQA
from cupy._core._routines_logic import less # NOQA
from cupy._core._routines_logic import less_equal # NOQA
from cupy._core._routines_logic import not_equal # NOQA
from cupy._core._routines_manipulation import array_split # NOQA
from cupy._core._routines_manipulation import broadcast # NOQA
from cupy._core._routines_manipulation import broadcast_to # NOQA
from cupy._core._routines_manipulation import concatenate_method # NOQA
from cupy._core._routines_manipulation import moveaxis # NOQA
from cupy._core._routines_manipulation import rollaxis # NOQA
from cupy._core._routines_manipulation import size  # NOQA
from cupy._core._routines_math import absolute # NOQA
from cupy._core._routines_math import add # NOQA
from cupy._core._routines_math import angle, angle_deg # NOQA
from cupy._core._routines_math import conjugate # NOQA
from cupy._core._routines_math import divide # NOQA
from cupy._core._routines_math import floor_divide # NOQA
from cupy._core._routines_math import multiply # NOQA
from cupy._core._routines_math import negative # NOQA
from cupy._core._routines_math import positive # NOQA
from cupy._core._routines_math import power # NOQA
from cupy._core._routines_math import remainder # NOQA
from cupy._core._routines_math import sqrt # NOQA
from cupy._core._routines_math import subtract # NOQA
from cupy._core._routines_math import true_divide # NOQA
from cupy._core._routines_statistics import nanmax # NOQA
from cupy._core._routines_statistics import nanmin # NOQA
from cupy._core.core import _internal_ascontiguousarray # NOQA
from cupy._core.core import _internal_asfortranarray # NOQA
from cupy._core.core import array # NOQA
from cupy._core.core import ascontiguousarray # NOQA
from cupy._core.core import asfortranarray # NOQA
from cupy._core.core import divmod # NOQA
from cupy._core.core import elementwise_copy # NOQA
from cupy._core.core import ndarray # NOQA
from cupy._core.dlpack import fromDlpack # NOQA
from cupy._core.dlpack import from_dlpack # NOQA
from cupy._core.internal import complete_slice # NOQA
from cupy._core.internal import get_size # NOQA
from cupy._core.raw import RawKernel # NOQA
from cupy._core.raw import RawModule # NOQA
# (.pxd declarations) Per-category accelerator preference lists; defined and
# populated in the corresponding .pyx at import time.
cdef list _elementwise_accelerators
cdef list _reduction_accelerators
cdef list _routine_accelerators


# Backends that CuPy routines may be accelerated with.
cpdef enum accelerator_type:
    ACCELERATOR_CUB = 1
    ACCELERATOR_CUTENSOR = 2
    ACCELERATOR_CUTENSORNET = 3
import os
from cupy_backends.cuda.api cimport runtime
# Start empty; _set_default_accelerators() below fills these from the
# CUPY_ACCELERATORS environment variable at import time.
cdef list _elementwise_accelerators = []
cdef list _reduction_accelerators = []
cdef list _routine_accelerators = []
cdef int _get_accelerator(accelerator) except -1:
    # Normalizes a user-facing accelerator spec (an accelerator_type value or
    # its string name) to the integer enum value.
    if isinstance(accelerator, int):
        # Already an accelerator_type value: pass through unchanged.
        return accelerator
    for label, value in (('cub', ACCELERATOR_CUB),
                         ('cutensor', ACCELERATOR_CUTENSOR),
                         ('cutensornet', ACCELERATOR_CUTENSORNET)):
        if accelerator == label:
            return value
    raise ValueError('Unknown accelerator: {}'.format(accelerator))
def set_elementwise_accelerators(accelerators):
    """Sets the accelerators tried for elementwise kernels."""
    global _elementwise_accelerators
    _elementwise_accelerators = list(map(_get_accelerator, accelerators))
def set_reduction_accelerators(accelerators):
    """Sets the accelerators tried for reduction kernels."""
    global _reduction_accelerators
    _reduction_accelerators = list(map(_get_accelerator, accelerators))
def set_routine_accelerators(accelerators):
    """Sets the accelerators tried for routines."""
    global _routine_accelerators
    _routine_accelerators = list(map(_get_accelerator, accelerators))
def get_elementwise_accelerators():
    # Returns the module-level preference list itself (not a copy).
    return _elementwise_accelerators
def get_reduction_accelerators():
    # Returns the module-level preference list itself (not a copy).
    return _reduction_accelerators
def get_routine_accelerators():
    # Returns the module-level preference list itself (not a copy).
    return _routine_accelerators
cdef _set_default_accelerators():
    # Initializes all three accelerator lists from the CUPY_ACCELERATORS
    # environment variable (comma-separated names). The default is 'cub' on
    # CUDA and no accelerators on HIP/ROCm.
    cdef str b, accelerator_names = os.getenv(
        'CUPY_ACCELERATORS', '' if runtime._is_hip_environment else 'cub')
    cdef list accelerators = [b for b in accelerator_names.split(',') if b]
    set_elementwise_accelerators(accelerators)
    set_reduction_accelerators(accelerators)
    set_routine_accelerators(accelerators)


# Apply the environment-driven defaults at import time.
_set_default_accelerators()
cimport cython # NOQA
from libcpp cimport vector
from cupy.cuda cimport function
# C++ vectors used to pass shapes/strides around without building Python
# tuples.
ctypedef vector.vector[Py_ssize_t] shape_t
ctypedef vector.vector[Py_ssize_t] strides_t

# this matches NPY_MAXDIMS
# Note: we make it an enum to work around cython/cython#4369
cdef enum: MAX_NDIM = 32
# C-side array descriptor: data pointer, element count, and the shape values
# followed by the stride values packed into one fixed-size buffer.
cdef struct _CArray:
    void* data
    Py_ssize_t size
    Py_ssize_t shape_and_strides[MAX_NDIM * 2]
@cython.final
cdef class CArray(function.CPointer):
    # Kernel-argument wrapper embedding a _CArray struct (see the .pyx
    # implementation for how self.ptr is pointed at it).
    cdef:
        _CArray val

    cdef void init(
        self, void* data_ptr, Py_ssize_t data_size,
        const shape_t& shape, const strides_t& strides) except*
# C-side indexer descriptor: element count plus shape (and index scratch)
# values in one fixed-size buffer.
cdef struct _CIndexer:
    Py_ssize_t size
    Py_ssize_t shape_and_index[MAX_NDIM * 2]
cdef class CIndexer(function.CPointer):
    # Kernel-argument wrapper embedding a _CIndexer struct.
    cdef:
        _CIndexer val

    cdef void init(self, Py_ssize_t size, const shape_t &shape) except*
cdef class Indexer:
    # Python-side holder of an iteration space: total size, shape, and
    # whether 32-bit indexing suffices.
    cdef:
        readonly Py_ssize_t size
        readonly shape_t shape
        readonly bint _index_32_bits

    cdef void init(self, const shape_t& shape)
    cdef function.CPointer get_pointer(self)


# Factory declared here, implemented in the .pyx.
cdef Indexer _indexer_init(const shape_t& shape)
from cupy.cuda cimport function
from cupy._core cimport internal
cdef class CArray(function.CPointer):

    cdef void init(
            self, void* data_ptr, Py_ssize_t data_size,
            const shape_t& shape, const strides_t& strides) except*:
        # Fills the embedded _CArray struct and points self.ptr at it so the
        # whole descriptor can be passed to a kernel as one argument.
        cdef size_t ndim = shape.size()
        assert ndim == strides.size()
        assert ndim <= MAX_NDIM
        cdef Py_ssize_t* shape_and_strides = (
            self.val.shape_and_strides)
        cdef size_t i
        self.val.data = data_ptr
        self.val.size = data_size
        for i in range(ndim):
            # Layout: shape occupies [0, ndim), strides occupy [ndim, 2*ndim).
            shape_and_strides[i] = shape[i]
            shape_and_strides[i + ndim] = strides[i]
        self.ptr = <void*>&self.val
cdef class CIndexer(function.CPointer):

    cdef void init(self, Py_ssize_t size, const shape_t &shape) except*:
        # Copies size and shape into the embedded struct; only the first
        # ndim slots of shape_and_index are written here.
        cdef size_t ndim = shape.size()
        assert ndim <= MAX_NDIM
        self.val.size = size
        cdef Py_ssize_t i
        for i in range(<Py_ssize_t>shape.size()):
            self.val.shape_and_index[i] = shape[i]
        self.ptr = <void*>&self.val
cdef class Indexer:

    cdef void init(self, const shape_t& shape):
        self.shape = shape
        self.size = internal.prod(shape)
        # True when every flat index fits in 32 bits; presumably lets kernels
        # use 32-bit indexing -- see the users of _index_32_bits.
        self._index_32_bits = self.size <= (1 << 31)

    @property
    def ndim(self):
        # Number of dimensions of the indexed space.
        return self.shape.size()

    cdef function.CPointer get_pointer(self):
        # Materializes a CIndexer kernel argument for this iteration space.
        cdef CIndexer indexer = CIndexer.__new__(CIndexer)
        indexer.init(self.size, self.shape)
        return indexer
cdef inline Indexer _indexer_init(const shape_t& shape):
    # Factory: allocate without calling __init__ and initialize in C.
    cdef Indexer indexer = Indexer.__new__(Indexer)
    indexer.init(shape)
    return indexer
from typing import Any, List
_CodeType = Any  # TODO(asi1024): Correct type annotation


class CodeBlock:
    """Code fragment for the readable format.
    """

    def __init__(self, head: str, codes: _CodeType) -> None:
        # A trailing space separates the head from the opening brace; an
        # empty head produces an anonymous block.
        self._head = head + ' ' if head else ''
        self._codes = codes

    def _to_str_list(self, indent_width: int = 0) -> List[str]:
        # Renders this block (and nested blocks, recursively) as a list of
        # lines, indenting the body two spaces deeper than the braces.
        pad = ' ' * indent_width
        inner_pad = ' ' * (indent_width + 2)
        lines: List[str] = [pad + self._head + '{']
        for fragment in self._codes:
            if isinstance(fragment, str):
                lines.append(inner_pad + fragment)
            elif isinstance(fragment, CodeBlock):
                lines.extend(
                    fragment._to_str_list(indent_width=indent_width + 2))
            else:
                assert False
        lines.append(pad + '}')
        return lines

    def __str__(self) -> str:
        """Emit CUDA program like the following format.

        <<head>> {
          <<begin codes>>
          ...;
          <<end codes>>
        }
        """
        return '\n'.join(self._to_str_list())
from cupy._core._carray cimport shape_t
from cupy._core._kernel cimport _TypeMap
from cupy._core.core cimport _ndarray_base
# (.pxd declaration) CUB-based reduction fast path; judging by the bint
# return, presumably reports whether the CUB kernel handled the reduction --
# see the .pyx implementation for the contract.
cdef bint _try_to_call_cub_reduction(
    self, list in_args, list out_args, const shape_t& a_shape,
    stream, optimize_context, tuple key,
    map_expr, reduce_expr, post_map_expr,
    reduce_type, _TypeMap type_map,
    tuple reduce_axis, tuple out_axis, const shape_t& out_shape,
    _ndarray_base ret) except *
from cupy._core._carray cimport shape_t
from cupy._core cimport _kernel
from cupy._core cimport _optimize_config
from cupy._core cimport _reduction
from cupy._core cimport _scalar
from cupy._core.core cimport compile_with_cache
from cupy._core.core cimport _ndarray_base
from cupy._core.core cimport _internal_ascontiguousarray
from cupy._core cimport internal
from cupy.cuda cimport cub
from cupy.cuda cimport function
from cupy.cuda cimport memory
from cupy_backends.cuda.api cimport runtime
import math
import string
import sys
from cupy import _environment
from cupy._core._kernel import _get_param_info
from cupy.cuda import driver
from cupy import _util
cdef function.Function _create_cub_reduction_function(
        name, block_size, items_per_thread,
        reduce_type, params, arginfos, identity,
        pre_map_expr, reduce_expr, post_map_expr,
        _kernel._TypeMap type_map, preamble, options):
    # Assembles the CUDA C source of a CUB block-reduction kernel from the
    # string templates below, compiles it, and returns the compiled function.
    #
    # A (incomplete) list of internal variables:
    # _J : the index of an element in the array

    # ROCm5.3 and above requires c++14
    if runtime._is_hip_environment:
        options += ('--std=c++14',)
    else:
        # static_assert needs at least C++11 in NVRTC
        options += ('--std=c++11',)

    # Pick the compiler backend per platform.
    cdef str backend
    if runtime._is_hip_environment:
        # In ROCm, we need to set the include path. This does not work for
        # hiprtc as of ROCm 3.5.0, so we must use hipcc.
        options += ('-I' + _rocm_path + '/include', '-O2')
        backend = 'nvcc'  # this is confusing...
    elif sys.platform.startswith('win32'):
        # See #4771. NVRTC on Windows seems to have problems in handling empty
        # macros, so any usage like this:
        #     #ifndef CUB_NS_PREFIX
        #     #define CUB_NS_PREFIX
        #     #endif
        # will drive NVRTC nuts (error: this declaration has no storage class
        # or type specifier). However, we cannot find a minimum reproducer to
        # confirm this is the root cause, so we work around by using nvcc.
        backend = 'nvcc'
    else:
        # use jitify + nvrtc
        # TODO(leofang): how about simply specifying jitify=True when calling
        # compile_with_cache()?
        options += ('-DCUPY_USE_JITIFY',)
        backend = 'nvrtc'

    # TODO(leofang): try splitting the for-loop into full tiles and partial
    # tiles to utilize LoadDirectBlockedVectorized? See, for example,
    # https://github.com/NVlabs/cub/blob/c3cceac115c072fb63df1836ff46d8c60d9eb304/cub/agent/agent_reduce.cuh#L311-L346

    cdef str module_code = _get_cub_header_include()
    module_code += '''
${type_preamble}
${preamble}
typedef ${reduce_type} _type_reduce;
static_assert(sizeof(_type_reduce) <= 32,
"The intermediate reduction type is assumed to be at most 32 bytes.");
// Compile-time constants for CUB template specializations
#define ITEMS_PER_THREAD ${items_per_thread}
#define BLOCK_SIZE ${block_size}
// for hipCUB: use the hipcub namespace
#ifdef __HIP_DEVICE_COMPILE__
#define cub hipcub
#endif
#if defined FIRST_PASS
typedef type_in0_raw type_mid_in;
typedef _type_reduce type_mid_out;
#define POST_MAP(a) out0 = a;
#elif defined SECOND_PASS
typedef _type_reduce type_mid_in;
typedef type_out0_raw type_mid_out;
#define POST_MAP(a) (${post_map_expr})
#else // one-pass reduction
typedef type_in0_raw type_mid_in;
typedef type_out0_raw type_mid_out;
#define POST_MAP(a) (${post_map_expr})
#endif
struct _reduction_op {
__device__ __forceinline__ _type_reduce operator()(
const _type_reduce &a, const _type_reduce &b) const {
return ${reduce_expr};
}
};
extern "C"
__global__ void ${name}(${params}) {
unsigned int _tid = threadIdx.x;
'''

    # When the pre-map is the identity ('in0'), tiles can be loaded with a
    # specialized cub::BlockLoad.
    if pre_map_expr == 'in0':
        module_code += '''
// Specialize BlockLoad type for faster (?) loading
typedef cub::BlockLoad<_type_reduce, BLOCK_SIZE,
ITEMS_PER_THREAD, cub::BLOCK_LOAD_DIRECT> BlockLoadT;
// Shared memory for loading
__shared__ typename BlockLoadT::TempStorage temp_storage_load;
'''

    module_code += '''
// Specialize BlockReduce type for our thread block
typedef cub::BlockReduce<_type_reduce, BLOCK_SIZE> BlockReduceT;
// Shared memory for reduction
__shared__ typename BlockReduceT::TempStorage temp_storage;
// Declare reduction operation
_reduction_op op;
// input & output raw pointers
const type_mid_in* _in0 = static_cast<const type_mid_in*>(_raw_in0);
type_mid_out* _out0 = static_cast<type_mid_out*>(_raw_out0);
// Per-thread tile data
_type_reduce _sdata[ITEMS_PER_THREAD];
#pragma unroll
for (int j = 0; j < ITEMS_PER_THREAD; j++) {
_sdata[j] = _type_reduce(${identity});
}
// each block handles the reduction of 1 segment
size_t segment_idx = blockIdx.x * _segment_size;
const type_mid_in* segment_head = _in0 + segment_idx;
size_t i = 0; // tile head within the segment
int tile_size = (BLOCK_SIZE * ITEMS_PER_THREAD < _segment_size ?
BLOCK_SIZE * ITEMS_PER_THREAD :
_segment_size);
sizeT _seg_size = _segment_size;
#if defined FIRST_PASS
// for two-pass reduction only: "last segment" is special
if (_array_size > 0) {
if (_array_size - segment_idx <= _segment_size) {
_seg_size = _array_size - segment_idx;
}
#ifdef __HIP_DEVICE_COMPILE__
// We don't understand HIP...
__syncthreads(); // Propagate the new value back to memory
#endif
}
#endif
// loop over tiles within 1 segment
_type_reduce aggregate = _type_reduce(${identity});
for (i = 0; i < _seg_size; i += BLOCK_SIZE * ITEMS_PER_THREAD) {
// for the last tile
if (_seg_size - i <= tile_size) { tile_size = _seg_size - i; }
'''

    if pre_map_expr == 'in0':
        module_code += '''
// load a tile
BlockLoadT(temp_storage_load).Load(segment_head + i, _sdata, tile_size,
_type_reduce(${identity}));
'''
    else:  # pre_map_expr could be something like "in0 != type_in0_raw(0)"
        module_code += '''
// load a tile
#pragma unroll
for (int j = 0; j < ITEMS_PER_THREAD; j++) {
// index of the element in a tile
int e_idx = _tid * ITEMS_PER_THREAD + j;
// some pre_map_expr uses _J internally...
#if defined FIRST_PASS
int _J = (segment_idx + i + e_idx);
#else // only one pass
int _J = (segment_idx + i + e_idx) % _seg_size;
#endif
if (e_idx < tile_size) {
const type_mid_in in0 = *(segment_head + i + e_idx);
_sdata[j] = static_cast<_type_reduce>(${pre_map_expr});
} else {
_sdata[j] = _type_reduce(${identity});
}
}
'''

    module_code += '''
// Compute block reduction
// Note that the output is only meaningful for thread 0
aggregate = op(aggregate, BlockReduceT(temp_storage).Reduce(_sdata, op));
__syncthreads(); // for reusing temp_storage
}
if (_tid == 0) {
type_mid_out& out0 = *(_out0 + blockIdx.x);
POST_MAP(aggregate);
}
}
'''

    # Specialize the template with the caller-supplied expressions/typedefs.
    module_code = string.Template(module_code).substitute(
        name=name,
        block_size=block_size,
        items_per_thread=items_per_thread,
        reduce_type=reduce_type,
        params=_get_cub_kernel_params(params, arginfos),
        identity=identity,
        reduce_expr=reduce_expr,
        pre_map_expr=pre_map_expr,
        post_map_expr=post_map_expr,
        type_preamble=type_map.get_typedef_code(),
        preamble=preamble)

    # To specify the backend, we have to explicitly spell out the default
    # values for arch, cachd_dir, and prepend_cupy_headers to bypass
    # cdef/cpdef limitation...
    # NOTE(review): 'cachd_dir' looks like a typo of 'cache_dir'; it must
    # match the keyword that compile_with_cache() actually accepts -- confirm
    # against its signature before renaming.
    module = compile_with_cache(
        module_code, options, arch=None, cachd_dir=None,
        prepend_cupy_headers=True, backend=backend)
    return module.get_function(name)
@_util.memoize(for_each_device=True)
def _SimpleCubReductionKernel_get_cached_function(
        map_expr, reduce_expr, post_map_expr, reduce_type,
        params, arginfos, _kernel._TypeMap type_map,
        name, block_size, identity, preamble,
        options, cub_params):
    # Compiles (or fetches from the per-device memoize cache) the CUB
    # block-reduction kernel for this combination of expressions/types.
    items_per_thread = cub_params[0]
    # Mark the kernel name as coming from the CUB code path.
    cub_name = name.replace('cupy_', 'cupy_cub_')
    cub_name = cub_name.replace('cupyx_', 'cupyx_cub_')
    return _create_cub_reduction_function(
        cub_name, block_size, items_per_thread,
        reduce_type, params, arginfos, identity,
        map_expr, reduce_expr, post_map_expr,
        type_map, preamble, options)
# Toolchain locations resolved once at import time from the environment.
cdef str _cub_path = _environment.get_cub_path()
cdef str _nvcc_path = _environment.get_nvcc_path()
cdef str _rocm_path = _environment.get_rocm_path()
cdef str _hipcc_path = _environment.get_hipcc_path()
# Lazily computed cache for _get_cub_header_include().
cdef str _cub_header = None
cdef str _get_cub_header_include():
    # Returns the #include preamble matching where the CUB headers come from
    # (CuPy's bundled copy, the CUDA toolkit, or ROCm). The result is cached
    # in the module-level _cub_header after the first call.
    global _cub_header
    if _cub_header is not None:
        return _cub_header

    assert _cub_path is not None
    if _cub_path == '<bundle>':
        _cub_header = '''
#include <cupy/cuda_workaround.h>
#include <cupy/cub/cub/block/block_reduce.cuh>
#include <cupy/cub/cub/block/block_load.cuh>
'''
    elif _cub_path == '<CUDA>':
        _cub_header = '''
#include <cub/block/block_reduce.cuh>
#include <cub/block/block_load.cuh>
'''
    elif _cub_path == '<ROCm>':
        # As of ROCm 3.5.0, the block headers cannot be included by themselves
        # (many macros left undefined), so we must use the master header.
        _cub_header = '''
#include <hipcub/hipcub.hpp>
'''
    return _cub_header
# make it cpdef'd for unit tests
cpdef inline tuple _can_use_cub_block_reduction(
        list in_args, list out_args, tuple reduce_axis, tuple out_axis):
    '''
    If CUB BlockReduce can be used, this function returns a tuple of the
    needed parameters (axis permutation, contiguous size, and whether this is
    a full reduction), otherwise returns None.
    '''
    cdef tuple axis_permutes_cub
    cdef _ndarray_base in_arr
    cdef Py_ssize_t contiguous_size = 1
    cdef str order

    # detect whether CUB headers exists somewhere:
    if _cub_path is None:
        import warnings
        warnings.warn('CUB headers are not found.', RuntimeWarning)
        return None

    # we currently support reductions with 1 input and 1 output
    if len(in_args) != 1 or len(out_args) != 1:
        return None
    in_arr = in_args[0]

    # the axes might not be sorted when we arrive here...
    reduce_axis = tuple(sorted(reduce_axis))
    out_axis = tuple(sorted(out_axis))

    # check reduction axes, if not contiguous then fall back to old kernel
    if in_arr._f_contiguous:
        order = 'F'
        if not cub._cub_device_segmented_reduce_axis_compatible(
                reduce_axis, in_arr.ndim, order):
            return None
        # F-order: the reduced (leading) axes come first.
        axis_permutes_cub = reduce_axis + out_axis
    elif in_arr._c_contiguous:
        order = 'C'
        if not cub._cub_device_segmented_reduce_axis_compatible(
                reduce_axis, in_arr.ndim, order):
            return None
        # C-order: the reduced (trailing) axes come last.
        axis_permutes_cub = out_axis + reduce_axis
    else:
        return None
    # The combined permutation must cover all axes in natural order.
    if axis_permutes_cub != tuple(range(in_arr.ndim)):
        return None

    # full-reduction of N-D array: need to invoke the kernel twice
    cdef bint full_reduction = True if len(out_axis) == 0 else False

    # check if the number of elements is too large
    # (ref: cupy/cupy#3309 for CUB limit)
    for i in reduce_axis:
        contiguous_size *= in_arr.shape[i]
    if contiguous_size > 0x7fffffffffffffff or contiguous_size == 0:
        return None
    if full_reduction:
        # assume a GPU has at most 64 GB of physical memory
        if contiguous_size > 0x1000000000:
            return None
    else:
        # the number of blocks to be launched exceeds INT_MAX:
        if in_arr.size // contiguous_size > 0x7fffffff:
            return None

    # rare event (mainly for conda-forge users): nvcc is not found!
    if not runtime._is_hip_environment:
        if _nvcc_path is None:
            return None
    else:
        if _hipcc_path is None:
            return None
    return (axis_permutes_cub, contiguous_size, full_reduction)
# similar to cupy._core._kernel._get_kernel_params()
cdef str _get_cub_kernel_params(tuple params, tuple arginfos):
    # Renders the C parameter list for the generated __global__ kernel.
    cdef _kernel.ParameterInfo p
    cdef _kernel._ArgInfo arginfo
    cdef lst = []
    cdef str c_type, c_name
    cdef int i

    assert len(params) == len(arginfos)
    for i, (p, arginfo) in enumerate(zip(params, arginfos)):
        c_name = arginfo.get_c_var_name(p)
        if i < len(params) - 2:
            # Array arguments are passed as opaque (const) void pointers.
            c_type = 'const void*' if p.is_const else 'void*'
        else:
            # for segment size and array size
            c_type = arginfo.get_param_c_type(p)
        lst.append('{} {}'.format(c_type, c_name))
    return ', '.join(lst)
# Upper bound for the reduction block size: 256 on HIP/ROCm, 512 on CUDA.
cdef Py_ssize_t _cub_default_block_size = (
    256 if runtime._is_hip_environment else 512)
cdef (Py_ssize_t, Py_ssize_t) _get_cub_block_specs(  # NOQA
        Py_ssize_t contiguous_size):
    # This is recommended in the CUB internal and should be an
    # even number
    items_per_thread = 4

    # Calculate the reduction block dimensions.
    # Ideally, we want each block to handle one segment, so:
    # 1. block size < segment size: the block loops over the segment
    # 2. block size >= segment size: the segment fits in the block
    block_size = (contiguous_size + items_per_thread - 1) // items_per_thread
    # clp2: presumably rounds up to a power of two -- see
    # cupy._core.internal.clp2.
    block_size = internal.clp2(block_size)
    # Clamp to [warp size, default block size]; HIP wavefronts are 64 wide.
    warp_size = 32 if not runtime._is_hip_environment else 64
    if block_size < warp_size:
        block_size = warp_size
    elif block_size > _cub_default_block_size:
        block_size = _cub_default_block_size
    return items_per_thread, block_size
cdef _scalar.CScalar _cub_convert_to_c_scalar(
        Py_ssize_t segment_size, Py_ssize_t value):
    # Size arguments are passed as 32-bit scalars unless the segment size can
    # exceed INT_MAX, in which case the generic (wider) conversion is used.
    if segment_size > 0x7fffffff:
        return _scalar.scalar_to_c_scalar(value)
    else:
        return _scalar.CScalar.from_int32(value)
cdef inline void _cub_two_pass_launch(
        str name, Py_ssize_t block_size, Py_ssize_t segment_size,
        Py_ssize_t items_per_thread, str reduce_type, tuple params,
        list in_args, list out_args,
        str identity, str pre_map_expr, str reduce_expr, str post_map_expr,
        _kernel._TypeMap type_map, str preamble,
        tuple options, stream) except*:
    '''
    Notes:
      1. Two-pass reduction: the first pass distributes an even share over
         a number of blocks (with block_size threads), and the second pass
         does reduction over 1 block of threads
    '''
    cdef list out_args_2nd_pass = [out_args[0]]
    cdef Py_ssize_t contiguous_size, out_block_num
    cdef function.Function func
    cdef memory.MemoryPointer memptr
    cdef str post_map_expr1, post_map_expr2, f
    cdef list inout_args
    cdef tuple cub_params
    cdef size_t gridx, blockx
    cdef _ndarray_base in_arr

    # fair share
    contiguous_size = min(segment_size, block_size * items_per_thread)
    out_block_num = (segment_size + contiguous_size - 1) // contiguous_size
    assert out_block_num <= 0x7fffffff

    # Because we can't know sizeof(reduce_type) in advance, here we
    # conservatively assume it's 32 bytes and allocate a work area
    memptr = memory.alloc(out_block_num * 32)
    out_args[0] = memptr

    # ************************ 1st pass ************************
    name += '_pass1'
    inout_args = [in_args[0], out_args[0],
                  _cub_convert_to_c_scalar(segment_size, contiguous_size),
                  _cub_convert_to_c_scalar(segment_size, segment_size)]
    cub_params = (items_per_thread,)

    # For mean(): the first pass must not divide by the element count; that
    # happens in the second pass (see post_map_expr2 below).
    if 'mean' in name:
        post_map_expr1 = post_map_expr.replace('_in_ind.size()', '1.0')
        post_map_expr1 = post_map_expr1.replace('_out_ind.size()', '1.0')
    elif any((f in name for f in ('argmax', 'argmin'))):
        # Workaround: in NumPy the indices are always generated based on
        # a C-order array (since PyArray_ContiguousFromAny was called).
        # We have to do a conversion here (?) since we do not retain the
        # info on strides.
        # TODO(leofang): improve this workaround
        in_arr = in_args[0]
        if in_arr.ndim > 1 and in_arr._f_contiguous:
            in_arr = _internal_ascontiguousarray(in_arr)
            inout_args[0] = in_args[0] = in_arr
        post_map_expr1 = post_map_expr
    else:
        post_map_expr1 = post_map_expr

    # Retrieve the kernel function
    func = _SimpleCubReductionKernel_get_cached_function(
        pre_map_expr, reduce_expr, post_map_expr1, reduce_type,
        params,
        _kernel._get_arginfos(inout_args),
        type_map,
        name, block_size, identity, preamble,
        ('-DFIRST_PASS=1',), cub_params)
    # Kernel arguments passed to the __global__ function.
    gridx = <size_t>(out_block_num * block_size)
    blockx = <size_t>block_size
    # Launch the kernel
    func.linear_launch(gridx, inout_args, 0, blockx, stream)

    # ************************ 2nd pass ************************
    # The per-block partial results become the input; one block reduces them.
    name = name[:-1] + '2'
    contiguous_size = out_block_num
    out_block_num = 1
    in_args = out_args
    out_args = out_args_2nd_pass
    inout_args = [in_args[0], out_args[0],
                  _cub_convert_to_c_scalar(segment_size, contiguous_size),
                  _cub_convert_to_c_scalar(segment_size, segment_size)]

    # For mean()
    if 'mean' in name:
        post_map_expr2 = post_map_expr.replace('_in_ind.size()',
                                               '_array_size')
        post_map_expr2 = post_map_expr2.replace('_out_ind.size()', '1.0')
    else:
        post_map_expr2 = post_map_expr

    # Retrieve the kernel function
    func = _SimpleCubReductionKernel_get_cached_function(
        'in0', reduce_expr, post_map_expr2, reduce_type,
        params,
        _kernel._get_arginfos(inout_args),
        type_map,
        name, block_size, identity, preamble,
        ('-DSECOND_PASS=1',), cub_params)
    # Kernel arguments passed to the __global__ function.
    gridx = <size_t>(out_block_num * block_size)
    blockx = <size_t>block_size
    # Launch the kernel
    func.linear_launch(gridx, inout_args, 0, blockx, stream)
    cdef inline void _launch_cub(
            self, out_block_num, block_size, block_stride,
            in_args, out_args, in_shape, out_shape, type_map,
            map_expr, reduce_expr, post_map_expr, reduce_type,
            stream, params, cub_params) except *:
        """Launch the CUB-based reduction kernel.

        For a full reduction, delegates to the two-pass launcher
        (`_cub_two_pass_launch`) and returns.  Otherwise performs a
        single-pass segmented reduction: builds the in/out argument list,
        fetches (or compiles) the cached kernel and launches it.

        `cub_params` is the tuple
        ``(items_per_thread, contiguous_size, full_reduction)``.
        """
        cdef bint full_reduction
        cdef Py_ssize_t contiguous_size, items_per_thread
        cdef function.Function func
        # Kernel arguments passed to the __global__ function.
        items_per_thread = cub_params[0]
        contiguous_size = cub_params[1]
        full_reduction = cub_params[2]
        if full_reduction:
            _cub_two_pass_launch(
                self.name, block_size, contiguous_size, items_per_thread,
                reduce_type, params, in_args, out_args, self.identity,
                map_expr, reduce_expr, post_map_expr,
                type_map, self.preamble, (), stream)
            return
        else:
            # Single pass: append the segment size twice; the second scalar
            # is 0 here (contrast with the two-pass launcher, which passes
            # the real segment size) -- presumably consumed as the
            # "_array_size" parameter by the kernel; confirm against the
            # kernel source.
            inout_args = (
                in_args + out_args +
                [_cub_convert_to_c_scalar(
                    contiguous_size, contiguous_size),
                 _cub_convert_to_c_scalar(
                    contiguous_size, 0)])
            arginfos = _kernel._get_arginfos(inout_args)
            func = _SimpleCubReductionKernel_get_cached_function(
                map_expr, reduce_expr, post_map_expr, reduce_type,
                params, arginfos, type_map,
                self.name, block_size, self.identity, self.preamble,
                (), cub_params)
            # One thread block per output segment.
            func.linear_launch(
                out_block_num * block_size, inout_args, 0, block_size, stream)
def _get_cub_optimized_params(
self, optimize_config, in_args, out_args, in_shape, out_shape,
type_map, map_expr, reduce_expr, post_map_expr, reduce_type,
stream, full_reduction, out_block_num, contiguous_size, params):
in_args = [_reduction._optimizer_copy_arg(a) for a in in_args]
out_args = [_reduction._optimizer_copy_arg(a) for a in out_args]
items_per_thread, block_size = (
_get_cub_block_specs(contiguous_size))
default_block_size_log = math.floor(math.log2(block_size))
default_items_per_thread = items_per_thread
def target_func(block_size, items_per_thread):
block_stride = block_size * items_per_thread
cub_params = (
items_per_thread, contiguous_size, full_reduction)
_launch_cub(
self,
out_block_num, block_size, block_stride, in_args, out_args,
in_shape, out_shape, type_map, map_expr, reduce_expr,
post_map_expr, reduce_type, stream, params, cub_params)
def suggest_func(trial):
block_size_log = trial.suggest_int('block_size_log', 5, 10)
block_size = 2 ** block_size_log
items_per_thread = trial.suggest_int(
'items_per_thread', 2, 32, step=2)
trial.set_user_attr('block_size', block_size)
return block_size, items_per_thread
# CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES is a possible error
optimize_impl = optimize_config.optimize_impl
best = optimize_impl(
optimize_config, target_func, suggest_func,
default_best={
'block_size_log': default_block_size_log,
'items_per_thread': default_items_per_thread,
}, ignore_error=(driver.CUDADriverError,))
return best.params['items_per_thread'], best.user_attrs['block_size']
    cdef bint _try_to_call_cub_reduction(
            self, list in_args, list out_args, const shape_t& a_shape,
            stream, optimize_context, tuple key,
            map_expr, reduce_expr, post_map_expr,
            reduce_type, _kernel._TypeMap type_map,
            tuple reduce_axis, tuple out_axis, const shape_t& out_shape,
            _ndarray_base ret) except *:
        """Try to use cub.

        Updates `ret` and returns a boolean value whether cub is used.

        Note: input_expr and output_expr are not used in CUB kernels.
        """
        cdef tuple axis_permutes
        cdef tuple params, opt_params
        cdef shape_t in_shape
        cdef Py_ssize_t i
        cdef Py_ssize_t contiguous_size = -1
        cdef Py_ssize_t block_size, block_stride, out_block_num = 0

        # decide to use CUB or not
        can_use_cub = _can_use_cub_block_reduction(
            in_args, out_args, reduce_axis, out_axis)
        if can_use_cub is None:
            return False
        axis_permutes, contiguous_size, full_reduction = can_use_cub

        # Permute the input axes so the reduced axes become contiguous.
        in_shape = _reduction._set_permuted_args(
            in_args, axis_permutes, a_shape, self.in_params)
        if in_args[0]._f_contiguous:
            # Match the output layout to the F-contiguous input.
            ret._set_contiguous_strides(ret.dtype.itemsize, False)
            out_args[0] = ret
        if not full_reduction:  # just need one pass
            out_block_num = 1  # = number of segments
            for i in out_axis:
                out_block_num *= in_shape[i]
            if 'mean' in self.name:
                # For mean: divide by the segment length, not the full
                # index sizes used by the generic reduction template.
                post_map_expr = post_map_expr.replace(
                    '_in_ind.size()', '_segment_size')
                post_map_expr = post_map_expr.replace(
                    '_out_ind.size()', '1.0')

        # Pick an index type wide enough for the segment size.
        if contiguous_size > 0x7fffffff:  # INT_MAX
            size_type = 'uint64'
        else:
            size_type = 'int32'
        type_map = _kernel._TypeMap(type_map._pairs + (('sizeT', size_type),))
        params = (self._params[0:2]
                  + _get_param_info(size_type + ' _segment_size', True)
                  + _get_param_info(size_type + ' _array_size', True))

        # HACK for ReductionKernel:
        # 1. input/output arguments might not be named as in0/out0
        # 2. pre-/post- maps might not contain in0/out0
        # 3. type_map does not contain the expected names (type_in0_raw and
        #    type_out0_raw)
        cdef str old_in0 = params[0].name, old_out0 = params[1].name
        if old_in0 != 'in0' or old_out0 != 'out0':
            # avoid overwriting self's attributes
            params = (_get_param_info('T in0', True)
                      + _get_param_info('T out0', False)
                      + params[2:])
            map_expr = map_expr.replace(old_in0, 'in0')
            post_map_expr = post_map_expr.replace(old_out0, 'out0')
            type_map = _kernel._TypeMap(type_map._pairs + (
                ('type_in0_raw', in_args[0].dtype.type),
                ('type_out0_raw', out_args[0].dtype.type),
            ))

        # Calculate the reduction block dimensions.
        optimize_context = _optimize_config.get_current_context()
        if optimize_context is None:
            # Calculate manually
            items_per_thread, block_size = _get_cub_block_specs(
                contiguous_size)
        else:
            # Optimize dynamically
            key = ('cub_reduction',) + key
            opt_params = optimize_context.get_params(key)
            if opt_params is None:
                opt_params = _get_cub_optimized_params(
                    self,
                    optimize_context.config, in_args, out_args,
                    in_shape, out_shape, type_map, map_expr, reduce_expr,
                    post_map_expr, reduce_type, stream,
                    full_reduction, out_block_num, contiguous_size, params)
                # Cache the tuned parameters for subsequent calls.
                optimize_context.set_params(key, opt_params)
            items_per_thread, block_size = opt_params

        block_stride = block_size * items_per_thread
        cub_params = (items_per_thread, contiguous_size, full_reduction)
        _launch_cub(
            self,
            out_block_num,
            block_size,
            block_stride,
            in_args, out_args,
            in_shape, out_shape,
            type_map,
            map_expr, reduce_expr, post_map_expr, reduce_type,
            stream, params, cub_params)
        return True
# Forward (.pxd-style) declarations of the dtype helpers so that other
# Cython modules can cimport them.
cpdef get_dtype(t)
cpdef tuple get_dtype_with_itemsize(t)
cpdef int to_cuda_dtype(dtype, bint is_half_allowed=*) except -1
cpdef void _raise_if_invalid_cast(
    from_dt,
    to_dt,
    str casting,
    argname=*
) except *
cimport cython # NOQA
import numpy
import warnings
from cupy_backends.cuda.api cimport runtime
# NumPy typecode characters for every dtype supported here.
all_type_chars = '?bhilqBHILQefdFD'
# for c in '?bhilqBHILQefdFD':
#     print('#', c, '...', np.dtype(c).name)
# ? ... bool
# b ... int8
# h ... int16
# i ... int32
# l ... int64 (int32 in windows)
# q ... int64
# B ... uint8
# H ... uint16
# I ... uint32
# L ... uint64 (uint32 in windows)
# Q ... uint64
# e ... float16
# f ... float32
# d ... float64
# F ... complex64
# D ... complex128
# Memoization table: key -> (numpy.dtype, itemsize).
cdef dict _dtype_dict = {}
cdef _dtype = numpy.dtype


cdef _init_dtype_dict():
    """Populate `_dtype_dict` with every key form we want to accept."""
    # Python scalar types (and None) resolve to NumPy's default dtypes.
    for key in (int, float, bool, complex, None):
        dt = _dtype(key)
        _dtype_dict[key] = (dt, dt.itemsize)
    # Each typecode char also registers its NumPy scalar type.
    for char in all_type_chars:
        dt = _dtype(char)
        entry = (dt, dt.itemsize)
        _dtype_dict[char] = entry
        _dtype_dict[dt.type] = entry
    # Also accept canonical dtype names such as 'float64'.
    for name in {str(_dtype(c)) for c in all_type_chars}:
        dt = _dtype(name)
        _dtype_dict[name] = (dt, dt.itemsize)


_init_dtype_dict()
@cython.profile(False)
cpdef get_dtype(t):
    """Return the ``numpy.dtype`` for ``t``, using the memoized table."""
    cached = _dtype_dict.get(t, None)
    if cached is not None:
        return cached[0]
    # Key not pre-registered: let NumPy resolve it directly.
    return _dtype(t)
@cython.profile(False)
cpdef tuple get_dtype_with_itemsize(t):
    """Return ``(dtype, itemsize)`` for ``t``, using the memoized table."""
    cached = _dtype_dict.get(t, None)
    if cached is not None:
        return cached
    # Fall back to NumPy for keys outside the precomputed table.
    dt = _dtype(t)
    return dt, dt.itemsize
cpdef int to_cuda_dtype(dtype, bint is_half_allowed=False) except -1:
    """Map a NumPy dtype (or its typecode char) to the cudaDataType enum.

    Half-precision types are only accepted when ``is_half_allowed`` is
    True.  Raises TypeError for any unsupported dtype.
    """
    cdef str dtype_char
    try:
        dtype_char = dtype.char
    except AttributeError:
        # The caller passed the typecode character itself.
        dtype_char = dtype
    if is_half_allowed:
        if dtype_char == 'e':
            return runtime.CUDA_R_16F
        if dtype_char == 'E':
            # complex32, not supported in NumPy
            return runtime.CUDA_C_16F
    if dtype_char == 'f':
        return runtime.CUDA_R_32F
    if dtype_char == 'd':
        return runtime.CUDA_R_64F
    if dtype_char == 'F':
        return runtime.CUDA_C_32F
    if dtype_char == 'D':
        return runtime.CUDA_C_64F
    raise TypeError('dtype is not supported: {}'.format(dtype))
# Cache NumPy's cast-checking function in a module-level C global for
# fast repeated lookups.
cdef _numpy_can_cast = numpy.can_cast


cpdef void _raise_if_invalid_cast(
    from_dt, to_dt, str casting, argname="array data"
) except *:
    """Raise an error if a cast is not valid. Also checks whether the cast
    goes from complex to real and warns if it does.

    The error raised can be customized by giving `argname`. May pass a
    (lambda) function to avoid string construction on success.

    This function exists mainly to build a similar error everywhere.
    """
    # Identical dtype objects never need a cast.
    if from_dt is to_dt:
        return
    to_dt = get_dtype(to_dt)  # may still be a type not a dtype instance
    if casting == "same_kind" and from_dt.kind == to_dt.kind:
        # same-kind is the most common casting used and for NumPy dtypes.
        return
    if _numpy_can_cast(from_dt, to_dt, casting):
        if casting == "unsafe" and from_dt.kind == "c" and to_dt.kind in "iuf":
            # Complex warning, we are dropping the imaginary part:
            warnings.warn(
                'Casting complex values to real discards the imaginary part',
                numpy.ComplexWarning)
        return
    # Casting is not possible, raise the error
    if not isinstance(argname, str):
        # argname may be a callable that lazily builds the name string.
        argname = argname()
    raise TypeError(
        f'Cannot cast {argname} from {from_dt!r} to {to_dt!r} '
        f'according to the rule \'{casting}\'')
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment