Commit deb763b7 authored by root's avatar root
Browse files

clone code from github

parent 93bf084b
Pipeline #3386 canceled with stages
@inproceedings{cupy_learningsys2017,
author = "Okuta, Ryosuke and Unno, Yuya and Nishino, Daisuke and Hido, Shohei and Loomis, Crissman",
title = "CuPy: A NumPy-Compatible Library for NVIDIA GPU Calculations",
booktitle = "Proceedings of Workshop on Machine Learning Systems (LearningSys) in The Thirty-first Annual Conference on Neural Information Processing Systems (NIPS)",
year = "2017",
url = "http://learningsys.org/nips17/assets/papers/paper_16.pdf"
}
# CuPy Code of Conduct
CuPy follows the [NumFOCUS Code of Conduct][homepage] available at https://numfocus.org/code-of-conduct.
Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at `dlfw@preferred.jp`.
[homepage]: https://numfocus.org/
Copyright (c) 2015 Preferred Infrastructure, Inc.
Copyright (c) 2015 Preferred Networks, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
# Contents of sdist. See also `setup.py`.
recursive-include cupy *.h *.hpp
recursive-include cupy *.pyx *.pxd *.pxi
recursive-include cupy_backends *.h *.hpp
recursive-include cupy_backends *.pyx *.pxd *.pxi
# Fail-safe to avoid including Cythonized sources in sdist.
recursive-exclude cupy *.cpp
recursive-exclude cupy_backends *.cpp
# Installers
recursive-include install *.py
recursive-include tests *.py
# Licenses
include LICENSE
include docs/LICENSE_THIRD_PARTY
include docs/source/license.rst
comment: false
github_checks:
  annotations: false

coverage:
  status:
    # Disable coverage measurement for overall codebase.
    project: off
    # Enable coverage measurement for diff introduced in the pull-request,
    # but do not mark "X" on commit status for now.
    patch:
      default:
        target: '0%'
# NumPy-compatible namespace bootstrap for ``cupy``.
#
# Helper modules are imported under leading-underscore aliases so that they
# do not show up as public ``cupy.*`` attributes.
import functools as _functools
import sys as _sys
import numpy as _numpy
from cupy import _environment
from cupy import _version

# Environment sanity checks and native-library preloading; these must run
# before the compiled core (``cupy._core``) is imported below.
_environment._detect_duplicate_installation() # NOQA
_environment._setup_win32_dll_directory() # NOQA
_environment._preload_library('cutensor') # NOQA
_environment._preload_library('nccl') # NOQA

try:
    # Importing the compiled (Cython) core can fail when CUDA libraries are
    # missing or mismatched; wrap the failure with diagnostic guidance.
    from cupy import _core # NOQA
except ImportError as exc:
    raise ImportError(f'''
================================================================
{_environment._diagnose_import_error()}
Original error:
{type(exc).__name__}: {exc}
================================================================
''') from exc

from cupy import cuda # NOQA
# Do not make `cupy.cupyx` available because it is confusing.
import cupyx as _cupyx # NOQA
def is_available():
    """Return whether a CUDA device is usable from this process.

    .. seealso:: :func:`cupy.cuda.is_available`
    """
    available = cuda.is_available()
    return available
__version__ = _version.__version__
from cupy import fft # NOQA
from cupy import linalg # NOQA
from cupy import polynomial # NOQA
from cupy import random # NOQA
# `cupy.sparse` is deprecated in v8
from cupy import sparse # NOQA
from cupy import testing # NOQA # NOQA
# import class and function
from cupy._core import ndarray # NOQA
from cupy._core import ufunc # NOQA
# =============================================================================
# Constants (borrowed from NumPy)
# =============================================================================
from numpy import e # NOQA
from numpy import euler_gamma # NOQA
from numpy import inf # NOQA
from numpy import nan # NOQA
from numpy import newaxis # == None # NOQA
from numpy import pi # NOQA
# APIs to be removed in NumPy 2.0.
# Remove these when bumping the baseline API to NumPy 2.0.
# https://github.com/cupy/cupy/pull/7800
PINF = Inf = Infinity = infty = inf # NOQA
NINF = -inf # NOQA
NAN = NaN = nan # NOQA
PZERO = 0.0 # NOQA
NZERO = -0.0 # NOQA
# =============================================================================
# Data types (borrowed from NumPy)
#
# The order of these declarations are borrowed from the NumPy document:
# https://numpy.org/doc/stable/reference/arrays.scalars.html
# =============================================================================
# -----------------------------------------------------------------------------
# Generic types
# -----------------------------------------------------------------------------
from numpy import complexfloating # NOQA
from numpy import floating # NOQA
from numpy import generic # NOQA
from numpy import inexact # NOQA
from numpy import integer # NOQA
from numpy import number # NOQA
from numpy import signedinteger # NOQA
from numpy import unsignedinteger # NOQA
# Not supported by CuPy:
# from numpy import flexible
# from numpy import character
# -----------------------------------------------------------------------------
# Booleans
# -----------------------------------------------------------------------------
from numpy import bool_ # NOQA
# -----------------------------------------------------------------------------
# Integers
# -----------------------------------------------------------------------------
from numpy import byte # NOQA
from numpy import short # NOQA
from numpy import intc # NOQA
from numpy import int_ # NOQA
from numpy import longlong # NOQA
from numpy import intp # NOQA
from numpy import int8 # NOQA
from numpy import int16 # NOQA
from numpy import int32 # NOQA
from numpy import int64 # NOQA
# -----------------------------------------------------------------------------
# Unsigned integers
# -----------------------------------------------------------------------------
from numpy import ubyte # NOQA
from numpy import ushort # NOQA
from numpy import uintc # NOQA
from numpy import uint # NOQA
from numpy import ulonglong # NOQA
from numpy import uintp # NOQA
from numpy import uint8 # NOQA
from numpy import uint16 # NOQA
from numpy import uint32 # NOQA
from numpy import uint64 # NOQA
# -----------------------------------------------------------------------------
# Floating-point numbers
# -----------------------------------------------------------------------------
from numpy import half # NOQA
from numpy import single # NOQA
from numpy import double # NOQA
from numpy import float_ # NOQA
from numpy import longfloat # NOQA
from numpy import float16 # NOQA
from numpy import float32 # NOQA
from numpy import float64 # NOQA
# Not supported by CuPy:
# from numpy import float96
# from numpy import float128
# -----------------------------------------------------------------------------
# Complex floating-point numbers
# -----------------------------------------------------------------------------
from numpy import csingle # NOQA
from numpy import singlecomplex # NOQA
from numpy import cdouble # NOQA
from numpy import cfloat # NOQA
from numpy import complex_ # NOQA
from numpy import complex64 # NOQA
from numpy import complex128 # NOQA
# Not supported by CuPy:
# from numpy import complex192
# from numpy import complex256
# from numpy import clongfloat
# -----------------------------------------------------------------------------
# Any Python object
# -----------------------------------------------------------------------------
# Not supported by CuPy:
# from numpy import object_
# from numpy import bytes_
# from numpy import unicode_
# from numpy import void
# -----------------------------------------------------------------------------
# Built-in Python types
# -----------------------------------------------------------------------------
# =============================================================================
# Routines
#
# The order of these declarations are borrowed from the NumPy document:
# https://numpy.org/doc/stable/reference/routines.html
# =============================================================================
# -----------------------------------------------------------------------------
# Array creation routines
# -----------------------------------------------------------------------------
from cupy._creation.basic import empty # NOQA
from cupy._creation.basic import empty_like # NOQA
from cupy._creation.basic import eye # NOQA
from cupy._creation.basic import full # NOQA
from cupy._creation.basic import full_like # NOQA
from cupy._creation.basic import identity # NOQA
from cupy._creation.basic import ones # NOQA
from cupy._creation.basic import ones_like # NOQA
from cupy._creation.basic import zeros # NOQA
from cupy._creation.basic import zeros_like # NOQA
from cupy._creation.from_data import copy # NOQA
from cupy._creation.from_data import array # NOQA
from cupy._creation.from_data import asanyarray # NOQA
from cupy._creation.from_data import asarray # NOQA
from cupy._creation.from_data import ascontiguousarray # NOQA
from cupy._creation.from_data import fromfile # NOQA
from cupy._creation.from_data import fromfunction # NOQA
from cupy._creation.from_data import fromiter # NOQA
from cupy._creation.from_data import frombuffer # NOQA
from cupy._creation.from_data import fromstring # NOQA
from cupy._creation.from_data import loadtxt # NOQA
from cupy._creation.from_data import genfromtxt # NOQA
from cupy._creation.ranges import arange # NOQA
from cupy._creation.ranges import linspace # NOQA
from cupy._creation.ranges import logspace # NOQA
from cupy._creation.ranges import meshgrid # NOQA
from cupy._creation.ranges import mgrid # NOQA
from cupy._creation.ranges import ogrid # NOQA
from cupy._creation.matrix import diag # NOQA
from cupy._creation.matrix import diagflat # NOQA
from cupy._creation.matrix import tri # NOQA
from cupy._creation.matrix import tril # NOQA
from cupy._creation.matrix import triu # NOQA
from cupy._creation.matrix import vander # NOQA
# -----------------------------------------------------------------------------
# Functional routines
# -----------------------------------------------------------------------------
from cupy._functional.piecewise import piecewise # NOQA
from cupy._functional.vectorize import vectorize # NOQA
from cupy.lib._shape_base import apply_along_axis # NOQA
# -----------------------------------------------------------------------------
# Array manipulation routines
# -----------------------------------------------------------------------------
from cupy._manipulation.basic import copyto # NOQA
from cupy._manipulation.shape import shape # NOQA
from cupy._manipulation.shape import ravel # NOQA
from cupy._manipulation.shape import reshape # NOQA
from cupy._manipulation.transpose import moveaxis # NOQA
from cupy._manipulation.transpose import rollaxis # NOQA
from cupy._manipulation.transpose import swapaxes # NOQA
from cupy._manipulation.transpose import transpose # NOQA
from cupy._manipulation.dims import atleast_1d # NOQA
from cupy._manipulation.dims import atleast_2d # NOQA
from cupy._manipulation.dims import atleast_3d # NOQA
from cupy._manipulation.dims import broadcast # NOQA
from cupy._manipulation.dims import broadcast_arrays # NOQA
from cupy._manipulation.dims import broadcast_to # NOQA
from cupy._manipulation.dims import expand_dims # NOQA
from cupy._manipulation.dims import squeeze # NOQA
from cupy._manipulation.join import column_stack # NOQA
from cupy._manipulation.join import concatenate # NOQA
from cupy._manipulation.join import dstack # NOQA
from cupy._manipulation.join import hstack # NOQA
from cupy._manipulation.join import stack # NOQA
from cupy._manipulation.join import vstack # NOQA
from cupy._manipulation.join import vstack as row_stack # NOQA
from cupy._manipulation.kind import asarray_chkfinite # NOQA
from cupy._manipulation.kind import asfarray # NOQA
from cupy._manipulation.kind import asfortranarray # NOQA
from cupy._manipulation.kind import require # NOQA
from cupy._manipulation.split import array_split # NOQA
from cupy._manipulation.split import dsplit # NOQA
from cupy._manipulation.split import hsplit # NOQA
from cupy._manipulation.split import split # NOQA
from cupy._manipulation.split import vsplit # NOQA
from cupy._manipulation.tiling import repeat # NOQA
from cupy._manipulation.tiling import tile # NOQA
from cupy._manipulation.add_remove import append # NOQA
from cupy._manipulation.add_remove import resize # NOQA
from cupy._manipulation.add_remove import unique # NOQA
from cupy._manipulation.add_remove import trim_zeros # NOQA
from cupy._manipulation.rearrange import flip # NOQA
from cupy._manipulation.rearrange import fliplr # NOQA
from cupy._manipulation.rearrange import flipud # NOQA
from cupy._manipulation.rearrange import roll # NOQA
from cupy._manipulation.rearrange import rot90 # NOQA
# Borrowed from NumPy
if hasattr(_numpy, 'broadcast_shapes'): # NumPy 1.20
from numpy import broadcast_shapes # NOQA
# -----------------------------------------------------------------------------
# Binary operations
# -----------------------------------------------------------------------------
from cupy._binary.elementwise import bitwise_and # NOQA
from cupy._binary.elementwise import bitwise_or # NOQA
from cupy._binary.elementwise import bitwise_xor # NOQA
from cupy._binary.elementwise import bitwise_not # NOQA
from cupy._binary.elementwise import invert # NOQA
from cupy._binary.elementwise import left_shift # NOQA
from cupy._binary.elementwise import right_shift # NOQA
from cupy._binary.packing import packbits # NOQA
from cupy._binary.packing import unpackbits # NOQA
def binary_repr(num, width=None):
    """Return the binary representation of the input number as a string.

    This is a host-side helper; the conversion is delegated to NumPy since
    the result is a plain Python string.

    .. seealso:: :func:`numpy.binary_repr`
    """
    return _numpy.binary_repr(num, width=width)
# -----------------------------------------------------------------------------
# Data type routines (mostly borrowed from NumPy)
# -----------------------------------------------------------------------------
def can_cast(from_, to, casting='safe'):
    """Return True if a cast between data types can occur according to
    the casting rule.

    If ``from_`` is a scalar or array scalar, also returns True if the
    scalar value can be cast without overflow or truncation to an integer.

    .. seealso:: :func:`numpy.can_cast`
    """
    if isinstance(from_, ndarray):
        from_ = from_.dtype
    return _numpy.can_cast(from_, to, casting=casting)
def common_type(*arrays):
    """Return a scalar type which is common to the input arrays.

    Boolean arrays are rejected; integer arrays are promoted to float64
    before the common type is computed.

    .. seealso:: :func:`numpy.common_type`
    """
    # With no inputs, mirror NumPy's behavior of returning float16.
    if not arrays:
        return _numpy.float16
    promoted_float = _numpy.dtype('float64')
    dtypes = []
    for arr in arrays:
        kind = arr.dtype.kind
        if kind == 'b':
            raise TypeError('can\'t get common type for non-numeric array')
        # Integer (signed/unsigned) inputs participate as float64.
        dtypes.append(promoted_float if kind in 'iu' else arr.dtype)
    return _functools.reduce(_numpy.promote_types, dtypes).type
def result_type(*arrays_and_dtypes):
    """Return the type resulting from applying the NumPy type promotion
    rules to the arguments.

    .. seealso:: :func:`numpy.result_type`
    """
    dtypes = []
    for item in arrays_and_dtypes:
        # Arrays contribute their dtype; everything else is passed through.
        dtypes.append(item.dtype if isinstance(item, ndarray) else item)
    return _numpy.result_type(*dtypes)
from cupy._core.core import min_scalar_type # NOQA
from numpy import obj2sctype # NOQA
from numpy import promote_types # NOQA
from numpy import dtype # NOQA
from numpy import format_parser # NOQA
from numpy import finfo # NOQA
from numpy import iinfo # NOQA
from numpy import find_common_type # NOQA
from numpy import issctype # NOQA
from numpy import issubclass_ # NOQA
from numpy import issubdtype # NOQA
from numpy import issubsctype # NOQA
from numpy import mintypecode # NOQA
from numpy import sctype2char # NOQA
from numpy import typename # NOQA
# -----------------------------------------------------------------------------
# Optionally Scipy-accelerated routines
# -----------------------------------------------------------------------------
# TODO(beam2d): Implement it
# -----------------------------------------------------------------------------
# Discrete Fourier Transform
# -----------------------------------------------------------------------------
# TODO(beam2d): Implement it
# -----------------------------------------------------------------------------
# Indexing routines
# -----------------------------------------------------------------------------
from cupy._indexing.generate import c_ # NOQA
from cupy._indexing.generate import indices # NOQA
from cupy._indexing.generate import ix_ # NOQA
from cupy._indexing.generate import mask_indices # NOQA
from cupy._indexing.generate import tril_indices # NOQA
from cupy._indexing.generate import tril_indices_from # NOQA
from cupy._indexing.generate import triu_indices # NOQA
from cupy._indexing.generate import triu_indices_from # NOQA
from cupy._indexing.generate import r_ # NOQA
from cupy._indexing.generate import ravel_multi_index # NOQA
from cupy._indexing.generate import unravel_index # NOQA
from cupy._indexing.indexing import choose # NOQA
from cupy._indexing.indexing import compress # NOQA
from cupy._indexing.indexing import diagonal # NOQA
from cupy._indexing.indexing import extract # NOQA
from cupy._indexing.indexing import select # NOQA
from cupy._indexing.indexing import take # NOQA
from cupy._indexing.indexing import take_along_axis # NOQA
from cupy._indexing.insert import place # NOQA
from cupy._indexing.insert import put # NOQA
from cupy._indexing.insert import putmask # NOQA
from cupy._indexing.insert import fill_diagonal # NOQA
from cupy._indexing.insert import diag_indices # NOQA
from cupy._indexing.insert import diag_indices_from # NOQA
from cupy._indexing.iterate import flatiter # NOQA
# Borrowed from NumPy
from numpy import get_array_wrap # NOQA
from numpy import index_exp # NOQA
from numpy import ndindex # NOQA
from numpy import s_ # NOQA
# -----------------------------------------------------------------------------
# Input and output
# -----------------------------------------------------------------------------
from cupy._io.npz import load # NOQA
from cupy._io.npz import save # NOQA
from cupy._io.npz import savez # NOQA
from cupy._io.npz import savez_compressed # NOQA
from cupy._io.formatting import array_repr # NOQA
from cupy._io.formatting import array_str # NOQA
from cupy._io.formatting import array2string # NOQA
from cupy._io.formatting import format_float_positional # NOQA
from cupy._io.formatting import format_float_scientific # NOQA
from cupy._io.text import savetxt # NOQA
def base_repr(number, base=2, padding=0):  # NOQA (needed to avoid redefinition of `number`)
    """Return a string representation of a number in the given base system.

    .. seealso:: :func:`numpy.base_repr`
    """
    digits = _numpy.base_repr(number, base, padding)
    return digits
# Borrowed from NumPy
from numpy import DataSource # NOQA
from numpy import get_printoptions # NOQA
from numpy import set_printoptions # NOQA
from numpy import printoptions # NOQA
from numpy import set_string_function # NOQA
# -----------------------------------------------------------------------------
# Linear algebra
# -----------------------------------------------------------------------------
from cupy.linalg._einsum import einsum # NOQA
from cupy.linalg._product import cross # NOQA
from cupy.linalg._product import dot # NOQA
from cupy.linalg._product import inner # NOQA
from cupy.linalg._product import kron # NOQA
from cupy.linalg._product import matmul # NOQA
from cupy.linalg._product import outer # NOQA
from cupy.linalg._product import tensordot # NOQA
from cupy.linalg._product import vdot # NOQA
from cupy.linalg._norms import trace # NOQA
# -----------------------------------------------------------------------------
# Logic functions
# -----------------------------------------------------------------------------
from cupy._logic.comparison import allclose # NOQA
from cupy._logic.comparison import array_equal # NOQA
from cupy._logic.comparison import array_equiv # NOQA
from cupy._logic.comparison import isclose # NOQA
from cupy._logic.content import isfinite # NOQA
from cupy._logic.content import isinf # NOQA
from cupy._logic.content import isnan # NOQA
from cupy._logic.content import isneginf # NOQA
from cupy._logic.content import isposinf # NOQA
from cupy._logic.truth import in1d # NOQA
from cupy._logic.truth import isin # NOQA
from cupy._logic.type_testing import iscomplex # NOQA
from cupy._logic.type_testing import iscomplexobj # NOQA
from cupy._logic.type_testing import isfortran # NOQA
from cupy._logic.type_testing import isreal # NOQA
from cupy._logic.type_testing import isrealobj # NOQA
from cupy._logic.truth import in1d # NOQA
from cupy._logic.truth import intersect1d # NOQA
from cupy._logic.truth import isin # NOQA
from cupy._logic.truth import setdiff1d # NOQA
from cupy._logic.truth import setxor1d # NOQA
from cupy._logic.truth import union1d # NOQA
def isscalar(element):
    """Return True if the type of ``element`` is a scalar type.

    Delegates to NumPy, since scalar-ness is a host-side property.

    .. seealso:: :func:`numpy.isscalar`
    """
    result = _numpy.isscalar(element)
    return result
from cupy._logic.ops import logical_and # NOQA
from cupy._logic.ops import logical_not # NOQA
from cupy._logic.ops import logical_or # NOQA
from cupy._logic.ops import logical_xor # NOQA
from cupy._logic.comparison import equal # NOQA
from cupy._logic.comparison import greater # NOQA
from cupy._logic.comparison import greater_equal # NOQA
from cupy._logic.comparison import less # NOQA
from cupy._logic.comparison import less_equal # NOQA
from cupy._logic.comparison import not_equal # NOQA
from cupy._logic.truth import all # NOQA
from cupy._logic.truth import all as alltrue # NOQA
from cupy._logic.truth import any # NOQA
from cupy._logic.truth import any as sometrue # NOQA
# ------------------------------------------------------------------------------
# Polynomial functions
# ------------------------------------------------------------------------------
from cupy.lib._polynomial import poly1d # NOQA
from cupy.lib._routines_poly import poly # NOQA
from cupy.lib._routines_poly import polyadd # NOQA
from cupy.lib._routines_poly import polysub # NOQA
from cupy.lib._routines_poly import polymul # NOQA
from cupy.lib._routines_poly import polyfit # NOQA
from cupy.lib._routines_poly import polyval # NOQA
from cupy.lib._routines_poly import roots # NOQA
# Borrowed from NumPy
from numpy import RankWarning # NOQA
# -----------------------------------------------------------------------------
# Mathematical functions
# -----------------------------------------------------------------------------
from cupy._math.trigonometric import arccos # NOQA
from cupy._math.trigonometric import arcsin # NOQA
from cupy._math.trigonometric import arctan # NOQA
from cupy._math.trigonometric import arctan2 # NOQA
from cupy._math.trigonometric import cos # NOQA
from cupy._math.trigonometric import deg2rad # NOQA
from cupy._math.trigonometric import degrees # NOQA
from cupy._math.trigonometric import hypot # NOQA
from cupy._math.trigonometric import rad2deg # NOQA
from cupy._math.trigonometric import radians # NOQA
from cupy._math.trigonometric import sin # NOQA
from cupy._math.trigonometric import tan # NOQA
from cupy._math.trigonometric import unwrap # NOQA
from cupy._math.hyperbolic import arccosh # NOQA
from cupy._math.hyperbolic import arcsinh # NOQA
from cupy._math.hyperbolic import arctanh # NOQA
from cupy._math.hyperbolic import cosh # NOQA
from cupy._math.hyperbolic import sinh # NOQA
from cupy._math.hyperbolic import tanh # NOQA
from cupy._math.rounding import around # NOQA
from cupy._math.rounding import ceil # NOQA
from cupy._math.rounding import fix # NOQA
from cupy._math.rounding import floor # NOQA
from cupy._math.rounding import rint # NOQA
from cupy._math.rounding import round_ # NOQA
from cupy._math.rounding import round_ as round # NOQA
from cupy._math.rounding import trunc # NOQA
from cupy._math.sumprod import prod # NOQA
from cupy._math.sumprod import prod as product # NOQA
from cupy._math.sumprod import sum # NOQA
from cupy._math.sumprod import cumprod # NOQA
from cupy._math.sumprod import cumprod as cumproduct # NOQA
from cupy._math.sumprod import cumsum # NOQA
from cupy._math.sumprod import ediff1d # NOQA
from cupy._math.sumprod import nancumprod # NOQA
from cupy._math.sumprod import nancumsum # NOQA
from cupy._math.sumprod import nansum # NOQA
from cupy._math.sumprod import nanprod # NOQA
from cupy._math.sumprod import diff # NOQA
from cupy._math.sumprod import gradient # NOQA
from cupy._math.sumprod import trapz # NOQA
from cupy._math.window import bartlett # NOQA
from cupy._math.window import blackman # NOQA
from cupy._math.window import hamming # NOQA
from cupy._math.window import hanning # NOQA
from cupy._math.window import kaiser # NOQA
from cupy._math.explog import exp # NOQA
from cupy._math.explog import exp2 # NOQA
from cupy._math.explog import expm1 # NOQA
from cupy._math.explog import log # NOQA
from cupy._math.explog import log10 # NOQA
from cupy._math.explog import log1p # NOQA
from cupy._math.explog import log2 # NOQA
from cupy._math.explog import logaddexp # NOQA
from cupy._math.explog import logaddexp2 # NOQA
from cupy._math.special import i0 # NOQA
from cupy._math.special import sinc # NOQA
from cupy._math.floating import copysign # NOQA
from cupy._math.floating import frexp # NOQA
from cupy._math.floating import ldexp # NOQA
from cupy._math.floating import nextafter # NOQA
from cupy._math.floating import signbit # NOQA
from cupy._math.rational import gcd # NOQA
from cupy._math.rational import lcm # NOQA
from cupy._math.arithmetic import add # NOQA
from cupy._math.arithmetic import divide # NOQA
from cupy._math.arithmetic import divmod # NOQA
from cupy._math.arithmetic import floor_divide # NOQA
from cupy._math.arithmetic import float_power # NOQA
from cupy._math.arithmetic import fmod # NOQA
from cupy._math.arithmetic import modf # NOQA
from cupy._math.arithmetic import multiply # NOQA
from cupy._math.arithmetic import negative # NOQA
from cupy._math.arithmetic import positive # NOQA
from cupy._math.arithmetic import power # NOQA
from cupy._math.arithmetic import reciprocal # NOQA
from cupy._math.arithmetic import remainder # NOQA
from cupy._math.arithmetic import remainder as mod # NOQA
from cupy._math.arithmetic import subtract # NOQA
from cupy._math.arithmetic import true_divide # NOQA
from cupy._math.arithmetic import angle # NOQA
from cupy._math.arithmetic import conjugate as conj # NOQA
from cupy._math.arithmetic import conjugate # NOQA
from cupy._math.arithmetic import imag # NOQA
from cupy._math.arithmetic import real # NOQA
from cupy._math.misc import absolute as abs # NOQA
from cupy._math.misc import absolute # NOQA
from cupy._math.misc import cbrt # NOQA
from cupy._math.misc import clip # NOQA
from cupy._math.misc import fabs # NOQA
from cupy._math.misc import fmax # NOQA
from cupy._math.misc import fmin # NOQA
from cupy._math.misc import interp # NOQA
from cupy._math.misc import maximum # NOQA
from cupy._math.misc import minimum # NOQA
from cupy._math.misc import nan_to_num # NOQA
from cupy._math.misc import real_if_close # NOQA
from cupy._math.misc import sign # NOQA
from cupy._math.misc import heaviside # NOQA
from cupy._math.misc import sqrt # NOQA
from cupy._math.misc import square # NOQA
from cupy._math.misc import convolve # NOQA
# -----------------------------------------------------------------------------
# Miscellaneous routines
# -----------------------------------------------------------------------------
from cupy._misc.byte_bounds import byte_bounds # NOQA
from cupy._misc.memory_ranges import may_share_memory # NOQA
from cupy._misc.memory_ranges import shares_memory # NOQA
from cupy._misc.who import who # NOQA
# Borrowed from NumPy
from numpy import disp # NOQA
from numpy import iterable # NOQA
from numpy import safe_eval # NOQA
from numpy import AxisError # NOQA
# -----------------------------------------------------------------------------
# Padding
# -----------------------------------------------------------------------------
from cupy._padding.pad import pad # NOQA
# -----------------------------------------------------------------------------
# Sorting, searching, and counting
# -----------------------------------------------------------------------------
from cupy._sorting.count import count_nonzero # NOQA
from cupy._sorting.search import argmax # NOQA
from cupy._sorting.search import argmin # NOQA
from cupy._sorting.search import argwhere # NOQA
from cupy._sorting.search import flatnonzero # NOQA
from cupy._sorting.search import nanargmax # NOQA
from cupy._sorting.search import nanargmin # NOQA
from cupy._sorting.search import nonzero # NOQA
from cupy._sorting.search import searchsorted # NOQA
from cupy._sorting.search import where # NOQA
from cupy._sorting.sort import argpartition # NOQA
from cupy._sorting.sort import argsort # NOQA
from cupy._sorting.sort import lexsort # NOQA
from cupy._sorting.sort import msort # NOQA
from cupy._sorting.sort import sort_complex # NOQA
from cupy._sorting.sort import partition # NOQA
from cupy._sorting.sort import sort # NOQA
# -----------------------------------------------------------------------------
# Statistics
# -----------------------------------------------------------------------------
from cupy._statistics.correlation import corrcoef # NOQA
from cupy._statistics.correlation import cov # NOQA
from cupy._statistics.correlation import correlate # NOQA
from cupy._statistics.order import amax # NOQA
from cupy._statistics.order import amax as max # NOQA
from cupy._statistics.order import amin # NOQA
from cupy._statistics.order import amin as min # NOQA
from cupy._statistics.order import nanmax # NOQA
from cupy._statistics.order import nanmin # NOQA
from cupy._statistics.order import percentile # NOQA
from cupy._statistics.order import ptp # NOQA
from cupy._statistics.order import quantile # NOQA
from cupy._statistics.meanvar import median # NOQA
from cupy._statistics.meanvar import average # NOQA
from cupy._statistics.meanvar import mean # NOQA
from cupy._statistics.meanvar import std # NOQA
from cupy._statistics.meanvar import var # NOQA
from cupy._statistics.meanvar import nanmedian # NOQA
from cupy._statistics.meanvar import nanmean # NOQA
from cupy._statistics.meanvar import nanstd # NOQA
from cupy._statistics.meanvar import nanvar # NOQA
from cupy._statistics.histogram import bincount # NOQA
from cupy._statistics.histogram import digitize # NOQA
from cupy._statistics.histogram import histogram # NOQA
from cupy._statistics.histogram import histogram2d # NOQA
from cupy._statistics.histogram import histogramdd # NOQA
# -----------------------------------------------------------------------------
# Classes without their own docs
# -----------------------------------------------------------------------------
from numpy import ComplexWarning # NOQA
from numpy import ModuleDeprecationWarning # NOQA
from numpy import TooHardError # NOQA
from numpy import VisibleDeprecationWarning # NOQA
# -----------------------------------------------------------------------------
# Undocumented functions
# -----------------------------------------------------------------------------
from cupy._core import size # NOQA
def ndim(a):
    """Returns the number of dimensions of an array.

    Args:
        a (array-like): If it is not already an `cupy.ndarray`, a conversion
            via :func:`numpy.asarray` is attempted.

    Returns:
        (int): The number of dimensions in `a`.

    """
    # Objects exposing ``.ndim`` (e.g. cupy/numpy arrays) answer directly;
    # anything else falls back to NumPy's conversion-based implementation.
    if hasattr(a, 'ndim'):
        return a.ndim
    return _numpy.ndim(a)
# -----------------------------------------------------------------------------
# CuPy specific functions
# -----------------------------------------------------------------------------
from cupy._util import clear_memo # NOQA
from cupy._util import memoize # NOQA
from cupy._core import ElementwiseKernel # NOQA
from cupy._core import RawKernel # NOQA
from cupy._core import RawModule # NOQA
from cupy._core._reduction import ReductionKernel # NOQA
# -----------------------------------------------------------------------------
# DLPack
# -----------------------------------------------------------------------------
from cupy._core import fromDlpack # NOQA
from cupy._core import from_dlpack # NOQA
def asnumpy(a, stream=None, order='C', out=None):
"""Returns an array on the host memory from an arbitrary source array.
Args:
a: Arbitrary object that can be converted to :class:`numpy.ndarray`.
stream (cupy.cuda.Stream): CUDA stream object. If it is specified, then
the device-to-host copy runs asynchronously. Otherwise, the copy is
synchronous. Note that if ``a`` is not a :class:`cupy.ndarray`
object, then this argument has no effect.
order ({'C', 'F', 'A'}): The desired memory layout of the host
array. When ``order`` is 'A', it uses 'F' if ``a`` is
fortran-contiguous and 'C' otherwise.
out (numpy.ndarray): The output array to be written to. It must have
compatible shape and dtype with those of ``a``'s.
Returns:
numpy.ndarray: Converted array on the host memory.
"""
if isinstance(a, ndarray):
return a.get(stream=stream, order=order, out=out)
elif hasattr(a, "__cuda_array_interface__"):
return array(a).get(stream=stream, order=order, out=out)
else:
temp = _numpy.asarray(a, order=order)
if out is not None:
out[...] = temp
else:
out = temp
return out
# Handle to this module itself; returned by get_array_module() below when a
# CuPy array is among the arguments.
_cupy = _sys.modules[__name__]
def get_array_module(*args):
    """Returns the array module for arguments.

    This function is used to implement CPU/GPU generic code. If at least one of
    the arguments is a :class:`cupy.ndarray` object, the :mod:`cupy` module is
    returned.

    Args:
        args: Values to determine whether NumPy or CuPy should be used.

    Returns:
        module: :mod:`cupy` or :mod:`numpy` is returned based on the types of
        the arguments.

    .. admonition:: Example

       A NumPy/CuPy generic function can be written as follows

       >>> def softplus(x):
       ...     xp = cupy.get_array_module(x)
       ...     return xp.maximum(0, x) + xp.log1p(xp.exp(-abs(x)))

    """
    # Types that indicate the data lives on the GPU (including sparse
    # matrices and fusion proxies).
    gpu_types = (ndarray, _cupyx.scipy.sparse.spmatrix,
                 _core.fusion._FusionVarArray,
                 _core.new_fusion._ArrayProxy)
    if any(isinstance(arg, gpu_types) for arg in args):
        return _cupy
    return _numpy
# Public alias for the kernel-fusion decorator.
fuse = _core.fusion.fuse

disable_experimental_feature_warning = False

# set default allocator: create module-level memory pools and install their
# malloc as the default (pinned) memory allocators at import time.
_default_memory_pool = cuda.MemoryPool()
_default_pinned_memory_pool = cuda.PinnedMemoryPool()
cuda.set_allocator(_default_memory_pool.malloc)
cuda.set_pinned_memory_allocator(_default_pinned_memory_pool.malloc)
def get_default_memory_pool():
    """Returns CuPy default memory pool for GPU memory.

    Returns:
        cupy.cuda.MemoryPool: The memory pool object.

    .. note::
       If you want to disable memory pool, please use the following code.

       >>> cupy.cuda.set_allocator(None)

    """
    # Returns the module-level pool created at import time (not a copy).
    return _default_memory_pool
def get_default_pinned_memory_pool():
    """Returns CuPy default memory pool for pinned memory.

    Returns:
        cupy.cuda.PinnedMemoryPool: The memory pool object.

    .. note::
       If you want to disable memory pool, please use the following code.

       >>> cupy.cuda.set_pinned_memory_allocator(None)

    """
    # Returns the module-level pinned pool created at import time.
    return _default_pinned_memory_pool
def show_config(*, _full=False):
    """Prints the current runtime configuration to standard output."""
    # Render the runtime info object and write it out, flushing so the text
    # appears immediately even when stdout is block-buffered.
    info = _cupyx.get_runtime_info(full=_full)
    _sys.stdout.write(str(info))
    _sys.stdout.flush()
# Attribute names that are not defined eagerly in this module; __getattr__
# below forwards them to NumPy on first access.
_deprecated_apis = [
    'int0',
    'uint0',
    'bool8',
]
def __getattr__(name):
    # Module-level fallback (PEP 562): forward the deprecated NumPy aliases
    # listed in _deprecated_apis; anything else is a genuine missing
    # attribute.
    if name not in _deprecated_apis:
        raise AttributeError(f"module 'cupy' has no attribute {name!r}")
    return getattr(_numpy, name)
# Functions from the following NumPy document
# https://numpy.org/doc/stable/reference/routines.bitwise.html
from cupy import _core
# Element-wise bit operations re-exported from cupy._core ufuncs.
bitwise_and = _core.bitwise_and
bitwise_or = _core.bitwise_or
bitwise_xor = _core.bitwise_xor
bitwise_not = _core.invert  # NumPy exposes invert under both names
invert = _core.invert
left_shift = _core.left_shift
right_shift = _core.right_shift
import cupy
from cupy import _core
_packbits_kernel = {
'big': _core.ElementwiseKernel(
'raw T a, raw int32 a_size', 'uint8 packed',
'''for (int j = 0; j < 8; ++j) {
int k = i * 8 + j;
int bit = k < a_size && a[k] != 0;
packed |= bit << (7 - j);
}''',
'cupy_packbits_big'
),
'little': _core.ElementwiseKernel(
'raw T a, raw int32 a_size', 'uint8 packed',
'''for (int j = 0; j < 8; ++j) {
int k = i * 8 + j;
int bit = k < a_size && a[k] != 0;
packed |= bit << j;
}''',
'cupy_packbits_little'
)
}
def packbits(a, axis=None, bitorder='big'):
    """Packs the elements of a binary-valued array into bits in a uint8 array.

    This function currently does not support ``axis`` option.

    Args:
        a (cupy.ndarray): Input array.
        axis (int, optional): Not supported yet.
        bitorder (str, optional): bit order to use when packing the array,
            allowed values are `'little'` and `'big'`. Defaults to `'big'`.

    Returns:
        cupy.ndarray: The packed array.

    .. note::
        When the input array is empty, this function returns a copy of it,
        i.e., the type of the output array is not necessarily always uint8.
        This exactly follows the NumPy's behaviour (as of version 1.11),
        although this is inconsistent to the documentation.

    .. seealso:: :func:`numpy.packbits`

    """
    # Validate dtype and options before touching the data.
    if a.dtype.kind not in 'biu':
        raise TypeError(
            'Expected an input array of integer or boolean data type')
    if axis is not None:
        raise NotImplementedError('axis option is not supported yet')
    if bitorder not in ('big', 'little'):
        raise ValueError("bitorder must be either 'big' or 'little'")

    flat = a.ravel()
    # One output byte per 8 input elements, rounded up.
    n_bytes = (flat.size + 7) // 8
    packed = cupy.zeros((n_bytes,), dtype=cupy.uint8)
    return _packbits_kernel[bitorder](flat, flat.size, packed)
# Elementwise kernels expanding each bit of a uint8 input into one output
# element, MSB-first ('big') or LSB-first ('little').
_unpackbits_kernel = {
    'big': _core.ElementwiseKernel(
        'raw uint8 a', 'T unpacked',
        'unpacked = (a[i / 8] >> (7 - i % 8)) & 1;',
        'cupy_unpackbits_big'
    ),
    'little': _core.ElementwiseKernel(
        'raw uint8 a', 'T unpacked',
        'unpacked = (a[i / 8] >> (i % 8)) & 1;',
        'cupy_unpackbits_little'
    )
}
def unpackbits(a, axis=None, bitorder='big'):
    """Unpacks elements of a uint8 array into a binary-valued output array.

    This function currently does not support ``axis`` option.

    Args:
        a (cupy.ndarray): Input array.
        axis (int, optional): Not supported yet.
        bitorder (str, optional): bit order to use when unpacking the array,
            allowed values are `'little'` and `'big'`. Defaults to `'big'`.

    Returns:
        cupy.ndarray: The unpacked array.

    .. seealso:: :func:`numpy.unpackbits`

    """
    # Validate dtype and options before allocating the output.
    if a.dtype != cupy.uint8:
        raise TypeError('Expected an input array of unsigned byte data type')
    if axis is not None:
        raise NotImplementedError('axis option is not supported yet')
    if bitorder not in ('big', 'little'):
        raise ValueError("bitorder must be either 'big' or 'little'")

    # Uninitialized allocation is fine: the kernel writes every element.
    out = cupy.ndarray((a.size * 8), dtype=cupy.uint8)
    return _unpackbits_kernel[bitorder](a, out)
# mypy: ignore-errors
from cupy._core import core # NOQA
from cupy._core import fusion # NOQA
from cupy._core import internal # NOQA
# internal APIs for testing and development
from cupy._core._accelerator import set_elementwise_accelerators # NOQA
from cupy._core._accelerator import set_reduction_accelerators # NOQA
from cupy._core._accelerator import set_routine_accelerators # NOQA
from cupy._core._accelerator import get_elementwise_accelerators # NOQA
from cupy._core._accelerator import get_reduction_accelerators # NOQA
from cupy._core._accelerator import get_routine_accelerators # NOQA
# import class and function
from cupy._core._kernel import create_ufunc # NOQA
from cupy._core._kernel import ElementwiseKernel # NOQA
from cupy._core._kernel import ufunc # NOQA
from cupy._core._kernel import _get_warpsize # NOQA
from cupy._core._reduction import create_reduction_func # NOQA
from cupy._core._reduction import ReductionKernel # NOQA
from cupy._core._routines_binary import bitwise_and # NOQA
from cupy._core._routines_binary import bitwise_or # NOQA
from cupy._core._routines_binary import bitwise_xor # NOQA
from cupy._core._routines_binary import invert # NOQA
from cupy._core._routines_binary import left_shift # NOQA
from cupy._core._routines_binary import right_shift # NOQA
from cupy._core._routines_linalg import _mat_ptrs # NOQA
from cupy._core._routines_linalg import dot # NOQA
from cupy._core._routines_linalg import get_compute_type # NOQA
from cupy._core._routines_linalg import matmul # NOQA
from cupy._core._routines_linalg import set_compute_type # NOQA
from cupy._core._routines_linalg import tensordot_core # NOQA
from cupy._core._routines_logic import create_comparison # NOQA
from cupy._core._routines_logic import equal # NOQA
from cupy._core._routines_logic import greater # NOQA
from cupy._core._routines_logic import greater_equal # NOQA
from cupy._core._routines_logic import less # NOQA
from cupy._core._routines_logic import less_equal # NOQA
from cupy._core._routines_logic import not_equal # NOQA
from cupy._core._routines_manipulation import array_split # NOQA
from cupy._core._routines_manipulation import broadcast # NOQA
from cupy._core._routines_manipulation import broadcast_to # NOQA
from cupy._core._routines_manipulation import concatenate_method # NOQA
from cupy._core._routines_manipulation import moveaxis # NOQA
from cupy._core._routines_manipulation import rollaxis # NOQA
from cupy._core._routines_manipulation import size  # NOQA
from cupy._core._routines_math import absolute # NOQA
from cupy._core._routines_math import add # NOQA
from cupy._core._routines_math import angle, angle_deg # NOQA
from cupy._core._routines_math import conjugate # NOQA
from cupy._core._routines_math import divide # NOQA
from cupy._core._routines_math import floor_divide # NOQA
from cupy._core._routines_math import multiply # NOQA
from cupy._core._routines_math import negative # NOQA
from cupy._core._routines_math import positive # NOQA
from cupy._core._routines_math import power # NOQA
from cupy._core._routines_math import remainder # NOQA
from cupy._core._routines_math import sqrt # NOQA
from cupy._core._routines_math import subtract # NOQA
from cupy._core._routines_math import true_divide # NOQA
from cupy._core._routines_statistics import nanmax # NOQA
from cupy._core._routines_statistics import nanmin # NOQA
from cupy._core.core import _internal_ascontiguousarray # NOQA
from cupy._core.core import _internal_asfortranarray # NOQA
from cupy._core.core import array # NOQA
from cupy._core.core import ascontiguousarray # NOQA
from cupy._core.core import asfortranarray # NOQA
from cupy._core.core import divmod # NOQA
from cupy._core.core import elementwise_copy # NOQA
from cupy._core.core import ndarray # NOQA
from cupy._core.dlpack import fromDlpack # NOQA
from cupy._core.dlpack import from_dlpack # NOQA
from cupy._core.internal import complete_slice # NOQA
from cupy._core.internal import get_size # NOQA
from cupy._core.raw import RawKernel # NOQA
from cupy._core.raw import RawModule # NOQA
# (.pxd declarations) Per-category accelerator preference lists; defined and
# populated in the corresponding .pyx at import time.
cdef list _elementwise_accelerators
cdef list _reduction_accelerators
cdef list _routine_accelerators


# Backends that CuPy routines may be accelerated with.
cpdef enum accelerator_type:
    ACCELERATOR_CUB = 1
    ACCELERATOR_CUTENSOR = 2
    ACCELERATOR_CUTENSORNET = 3
import os
from cupy_backends.cuda.api cimport runtime
# Start empty; _set_default_accelerators() below fills these from the
# CUPY_ACCELERATORS environment variable at import time.
cdef list _elementwise_accelerators = []
cdef list _reduction_accelerators = []
cdef list _routine_accelerators = []
cdef int _get_accelerator(accelerator) except -1:
    # Normalizes a user-facing accelerator spec (an accelerator_type value or
    # its string name) to the integer enum value.
    if isinstance(accelerator, int):
        # Already an accelerator_type value: pass through unchanged.
        return accelerator
    for label, value in (('cub', ACCELERATOR_CUB),
                         ('cutensor', ACCELERATOR_CUTENSOR),
                         ('cutensornet', ACCELERATOR_CUTENSORNET)):
        if accelerator == label:
            return value
    raise ValueError('Unknown accelerator: {}'.format(accelerator))
def set_elementwise_accelerators(accelerators):
    """Sets the accelerators tried for elementwise kernels."""
    global _elementwise_accelerators
    _elementwise_accelerators = list(map(_get_accelerator, accelerators))
def set_reduction_accelerators(accelerators):
    """Sets the accelerators tried for reduction kernels."""
    global _reduction_accelerators
    _reduction_accelerators = list(map(_get_accelerator, accelerators))
def set_routine_accelerators(accelerators):
    """Sets the accelerators tried for routines."""
    global _routine_accelerators
    _routine_accelerators = list(map(_get_accelerator, accelerators))
def get_elementwise_accelerators():
    # Returns the module-level preference list itself (not a copy).
    return _elementwise_accelerators
def get_reduction_accelerators():
    # Returns the module-level preference list itself (not a copy).
    return _reduction_accelerators
def get_routine_accelerators():
    # Returns the module-level preference list itself (not a copy).
    return _routine_accelerators
cdef _set_default_accelerators():
    # Initializes all three accelerator lists from the CUPY_ACCELERATORS
    # environment variable (comma-separated names). The default is 'cub' on
    # CUDA and no accelerators on HIP/ROCm.
    cdef str b, accelerator_names = os.getenv(
        'CUPY_ACCELERATORS', '' if runtime._is_hip_environment else 'cub')
    cdef list accelerators = [b for b in accelerator_names.split(',') if b]
    set_elementwise_accelerators(accelerators)
    set_reduction_accelerators(accelerators)
    set_routine_accelerators(accelerators)


# Apply the environment-driven defaults at import time.
_set_default_accelerators()
cimport cython # NOQA
from libcpp cimport vector
from cupy.cuda cimport function
# C++ vectors used to pass shapes/strides around without building Python
# tuples.
ctypedef vector.vector[Py_ssize_t] shape_t
ctypedef vector.vector[Py_ssize_t] strides_t

# this matches NPY_MAXDIMS
# Note: we make it an enum to work around cython/cython#4369
cdef enum: MAX_NDIM = 32
# C-side array descriptor: data pointer, element count, and the shape values
# followed by the stride values packed into one fixed-size buffer.
cdef struct _CArray:
    void* data
    Py_ssize_t size
    Py_ssize_t shape_and_strides[MAX_NDIM * 2]
@cython.final
cdef class CArray(function.CPointer):
    # Kernel-argument wrapper embedding a _CArray struct (see the .pyx
    # implementation for how self.ptr is pointed at it).
    cdef:
        _CArray val

    cdef void init(
        self, void* data_ptr, Py_ssize_t data_size,
        const shape_t& shape, const strides_t& strides) except*
# C-side indexer descriptor: element count plus shape (and index scratch)
# values in one fixed-size buffer.
cdef struct _CIndexer:
    Py_ssize_t size
    Py_ssize_t shape_and_index[MAX_NDIM * 2]
cdef class CIndexer(function.CPointer):
    # Kernel-argument wrapper embedding a _CIndexer struct.
    cdef:
        _CIndexer val

    cdef void init(self, Py_ssize_t size, const shape_t &shape) except*
cdef class Indexer:
    # Python-side holder of an iteration space: total size, shape, and
    # whether 32-bit indexing suffices.
    cdef:
        readonly Py_ssize_t size
        readonly shape_t shape
        readonly bint _index_32_bits

    cdef void init(self, const shape_t& shape)
    cdef function.CPointer get_pointer(self)


# Factory declared here, implemented in the .pyx.
cdef Indexer _indexer_init(const shape_t& shape)
from cupy.cuda cimport function
from cupy._core cimport internal
cdef class CArray(function.CPointer):

    cdef void init(
            self, void* data_ptr, Py_ssize_t data_size,
            const shape_t& shape, const strides_t& strides) except*:
        # Fills the embedded _CArray struct and points self.ptr at it so the
        # whole descriptor can be passed to a kernel as one argument.
        cdef size_t ndim = shape.size()
        assert ndim == strides.size()
        assert ndim <= MAX_NDIM
        cdef Py_ssize_t* shape_and_strides = (
            self.val.shape_and_strides)
        cdef size_t i
        self.val.data = data_ptr
        self.val.size = data_size
        for i in range(ndim):
            # Layout: shape occupies [0, ndim), strides occupy [ndim, 2*ndim).
            shape_and_strides[i] = shape[i]
            shape_and_strides[i + ndim] = strides[i]
        self.ptr = <void*>&self.val
cdef class CIndexer(function.CPointer):

    cdef void init(self, Py_ssize_t size, const shape_t &shape) except*:
        # Copies size and shape into the embedded struct; only the first
        # ndim slots of shape_and_index are written here.
        cdef size_t ndim = shape.size()
        assert ndim <= MAX_NDIM
        self.val.size = size
        cdef Py_ssize_t i
        for i in range(<Py_ssize_t>shape.size()):
            self.val.shape_and_index[i] = shape[i]
        self.ptr = <void*>&self.val
cdef class Indexer:

    cdef void init(self, const shape_t& shape):
        self.shape = shape
        self.size = internal.prod(shape)
        # True when every flat index fits in 32 bits; presumably lets kernels
        # use 32-bit indexing -- see the users of _index_32_bits.
        self._index_32_bits = self.size <= (1 << 31)

    @property
    def ndim(self):
        # Number of dimensions of the indexed space.
        return self.shape.size()

    cdef function.CPointer get_pointer(self):
        # Materializes a CIndexer kernel argument for this iteration space.
        cdef CIndexer indexer = CIndexer.__new__(CIndexer)
        indexer.init(self.size, self.shape)
        return indexer
cdef inline Indexer _indexer_init(const shape_t& shape):
    # Factory: allocate without calling __init__ and initialize in C.
    cdef Indexer indexer = Indexer.__new__(Indexer)
    indexer.init(shape)
    return indexer
from typing import Any, List
_CodeType = Any  # TODO(asi1024): Correct type annotation


class CodeBlock:
    """Code fragment for the readable format.
    """

    def __init__(self, head: str, codes: _CodeType) -> None:
        # A trailing space separates the head from the opening brace; an
        # empty head produces an anonymous block.
        self._head = head + ' ' if head else ''
        self._codes = codes

    def _to_str_list(self, indent_width: int = 0) -> List[str]:
        # Renders this block (and nested blocks, recursively) as a list of
        # lines, indenting the body two spaces deeper than the braces.
        pad = ' ' * indent_width
        inner_pad = ' ' * (indent_width + 2)
        lines: List[str] = [pad + self._head + '{']
        for fragment in self._codes:
            if isinstance(fragment, str):
                lines.append(inner_pad + fragment)
            elif isinstance(fragment, CodeBlock):
                lines.extend(
                    fragment._to_str_list(indent_width=indent_width + 2))
            else:
                assert False
        lines.append(pad + '}')
        return lines

    def __str__(self) -> str:
        """Emit CUDA program like the following format.

        <<head>> {
          <<begin codes>>
          ...;
          <<end codes>>
        }
        """
        return '\n'.join(self._to_str_list())
from cupy._core._carray cimport shape_t
from cupy._core._kernel cimport _TypeMap
from cupy._core.core cimport _ndarray_base
# (.pxd declaration) CUB-based reduction fast path; judging by the bint
# return, presumably reports whether the CUB kernel handled the reduction --
# see the .pyx implementation for the contract.
cdef bint _try_to_call_cub_reduction(
    self, list in_args, list out_args, const shape_t& a_shape,
    stream, optimize_context, tuple key,
    map_expr, reduce_expr, post_map_expr,
    reduce_type, _TypeMap type_map,
    tuple reduce_axis, tuple out_axis, const shape_t& out_shape,
    _ndarray_base ret) except *
from cupy._core._carray cimport shape_t
from cupy._core cimport _kernel
from cupy._core cimport _optimize_config
from cupy._core cimport _reduction
from cupy._core cimport _scalar
from cupy._core.core cimport compile_with_cache
from cupy._core.core cimport _ndarray_base
from cupy._core.core cimport _internal_ascontiguousarray
from cupy._core cimport internal
from cupy.cuda cimport cub
from cupy.cuda cimport function
from cupy.cuda cimport memory
from cupy_backends.cuda.api cimport runtime
import math
import string
import sys
from cupy import _environment
from cupy._core._kernel import _get_param_info
from cupy.cuda import driver
from cupy import _util
cdef function.Function _create_cub_reduction_function(
        name, block_size, items_per_thread,
        reduce_type, params, arginfos, identity,
        pre_map_expr, reduce_expr, post_map_expr,
        _kernel._TypeMap type_map, preamble, options):
    # Assembles the CUDA C source of a CUB block-reduction kernel from the
    # string templates below, compiles it, and returns the compiled function.
    #
    # A (incomplete) list of internal variables:
    # _J : the index of an element in the array

    # ROCm5.3 and above requires c++14
    if runtime._is_hip_environment:
        options += ('--std=c++14',)
    else:
        # static_assert needs at least C++11 in NVRTC
        options += ('--std=c++11',)

    # Pick the compiler backend per platform.
    cdef str backend
    if runtime._is_hip_environment:
        # In ROCm, we need to set the include path. This does not work for
        # hiprtc as of ROCm 3.5.0, so we must use hipcc.
        options += ('-I' + _rocm_path + '/include', '-O2')
        backend = 'nvcc'  # this is confusing...
    elif sys.platform.startswith('win32'):
        # See #4771. NVRTC on Windows seems to have problems in handling empty
        # macros, so any usage like this:
        #     #ifndef CUB_NS_PREFIX
        #     #define CUB_NS_PREFIX
        #     #endif
        # will drive NVRTC nuts (error: this declaration has no storage class
        # or type specifier). However, we cannot find a minimum reproducer to
        # confirm this is the root cause, so we work around by using nvcc.
        backend = 'nvcc'
    else:
        # use jitify + nvrtc
        # TODO(leofang): how about simply specifying jitify=True when calling
        # compile_with_cache()?
        options += ('-DCUPY_USE_JITIFY',)
        backend = 'nvrtc'

    # TODO(leofang): try splitting the for-loop into full tiles and partial
    # tiles to utilize LoadDirectBlockedVectorized? See, for example,
    # https://github.com/NVlabs/cub/blob/c3cceac115c072fb63df1836ff46d8c60d9eb304/cub/agent/agent_reduce.cuh#L311-L346

    cdef str module_code = _get_cub_header_include()
    module_code += '''
${type_preamble}
${preamble}
typedef ${reduce_type} _type_reduce;
static_assert(sizeof(_type_reduce) <= 32,
"The intermediate reduction type is assumed to be at most 32 bytes.");
// Compile-time constants for CUB template specializations
#define ITEMS_PER_THREAD ${items_per_thread}
#define BLOCK_SIZE ${block_size}
// for hipCUB: use the hipcub namespace
#ifdef __HIP_DEVICE_COMPILE__
#define cub hipcub
#endif
#if defined FIRST_PASS
typedef type_in0_raw type_mid_in;
typedef _type_reduce type_mid_out;
#define POST_MAP(a) out0 = a;
#elif defined SECOND_PASS
typedef _type_reduce type_mid_in;
typedef type_out0_raw type_mid_out;
#define POST_MAP(a) (${post_map_expr})
#else // one-pass reduction
typedef type_in0_raw type_mid_in;
typedef type_out0_raw type_mid_out;
#define POST_MAP(a) (${post_map_expr})
#endif
struct _reduction_op {
__device__ __forceinline__ _type_reduce operator()(
const _type_reduce &a, const _type_reduce &b) const {
return ${reduce_expr};
}
};
extern "C"
__global__ void ${name}(${params}) {
unsigned int _tid = threadIdx.x;
'''

    # When the pre-map is the identity ('in0'), tiles can be loaded with a
    # specialized cub::BlockLoad.
    if pre_map_expr == 'in0':
        module_code += '''
// Specialize BlockLoad type for faster (?) loading
typedef cub::BlockLoad<_type_reduce, BLOCK_SIZE,
ITEMS_PER_THREAD, cub::BLOCK_LOAD_DIRECT> BlockLoadT;
// Shared memory for loading
__shared__ typename BlockLoadT::TempStorage temp_storage_load;
'''

    module_code += '''
// Specialize BlockReduce type for our thread block
typedef cub::BlockReduce<_type_reduce, BLOCK_SIZE> BlockReduceT;
// Shared memory for reduction
__shared__ typename BlockReduceT::TempStorage temp_storage;
// Declare reduction operation
_reduction_op op;
// input & output raw pointers
const type_mid_in* _in0 = static_cast<const type_mid_in*>(_raw_in0);
type_mid_out* _out0 = static_cast<type_mid_out*>(_raw_out0);
// Per-thread tile data
_type_reduce _sdata[ITEMS_PER_THREAD];
#pragma unroll
for (int j = 0; j < ITEMS_PER_THREAD; j++) {
_sdata[j] = _type_reduce(${identity});
}
// each block handles the reduction of 1 segment
size_t segment_idx = blockIdx.x * _segment_size;
const type_mid_in* segment_head = _in0 + segment_idx;
size_t i = 0; // tile head within the segment
int tile_size = (BLOCK_SIZE * ITEMS_PER_THREAD < _segment_size ?
BLOCK_SIZE * ITEMS_PER_THREAD :
_segment_size);
sizeT _seg_size = _segment_size;
#if defined FIRST_PASS
// for two-pass reduction only: "last segment" is special
if (_array_size > 0) {
if (_array_size - segment_idx <= _segment_size) {
_seg_size = _array_size - segment_idx;
}
#ifdef __HIP_DEVICE_COMPILE__
// We don't understand HIP...
__syncthreads(); // Propagate the new value back to memory
#endif
}
#endif
// loop over tiles within 1 segment
_type_reduce aggregate = _type_reduce(${identity});
for (i = 0; i < _seg_size; i += BLOCK_SIZE * ITEMS_PER_THREAD) {
// for the last tile
if (_seg_size - i <= tile_size) { tile_size = _seg_size - i; }
'''

    if pre_map_expr == 'in0':
        module_code += '''
// load a tile
BlockLoadT(temp_storage_load).Load(segment_head + i, _sdata, tile_size,
_type_reduce(${identity}));
'''
    else:  # pre_map_expr could be something like "in0 != type_in0_raw(0)"
        module_code += '''
// load a tile
#pragma unroll
for (int j = 0; j < ITEMS_PER_THREAD; j++) {
// index of the element in a tile
int e_idx = _tid * ITEMS_PER_THREAD + j;
// some pre_map_expr uses _J internally...
#if defined FIRST_PASS
int _J = (segment_idx + i + e_idx);
#else // only one pass
int _J = (segment_idx + i + e_idx) % _seg_size;
#endif
if (e_idx < tile_size) {
const type_mid_in in0 = *(segment_head + i + e_idx);
_sdata[j] = static_cast<_type_reduce>(${pre_map_expr});
} else {
_sdata[j] = _type_reduce(${identity});
}
}
'''

    module_code += '''
// Compute block reduction
// Note that the output is only meaningful for thread 0
aggregate = op(aggregate, BlockReduceT(temp_storage).Reduce(_sdata, op));
__syncthreads(); // for reusing temp_storage
}
if (_tid == 0) {
type_mid_out& out0 = *(_out0 + blockIdx.x);
POST_MAP(aggregate);
}
}
'''

    # Specialize the template with the caller-supplied expressions/typedefs.
    module_code = string.Template(module_code).substitute(
        name=name,
        block_size=block_size,
        items_per_thread=items_per_thread,
        reduce_type=reduce_type,
        params=_get_cub_kernel_params(params, arginfos),
        identity=identity,
        reduce_expr=reduce_expr,
        pre_map_expr=pre_map_expr,
        post_map_expr=post_map_expr,
        type_preamble=type_map.get_typedef_code(),
        preamble=preamble)

    # To specify the backend, we have to explicitly spell out the default
    # values for arch, cachd_dir, and prepend_cupy_headers to bypass
    # cdef/cpdef limitation...
    # NOTE(review): 'cachd_dir' looks like a typo of 'cache_dir'; it must
    # match the keyword that compile_with_cache() actually accepts -- confirm
    # against its signature before renaming.
    module = compile_with_cache(
        module_code, options, arch=None, cachd_dir=None,
        prepend_cupy_headers=True, backend=backend)
    return module.get_function(name)
@_util.memoize(for_each_device=True)
def _SimpleCubReductionKernel_get_cached_function(
        map_expr, reduce_expr, post_map_expr, reduce_type,
        params, arginfos, _kernel._TypeMap type_map,
        name, block_size, identity, preamble,
        options, cub_params):
    # Compiles (or fetches from the per-device memoize cache) the CUB
    # block-reduction kernel for this combination of expressions/types.
    items_per_thread = cub_params[0]
    # Mark the kernel name as coming from the CUB code path.
    cub_name = name.replace('cupy_', 'cupy_cub_')
    cub_name = cub_name.replace('cupyx_', 'cupyx_cub_')
    return _create_cub_reduction_function(
        cub_name, block_size, items_per_thread,
        reduce_type, params, arginfos, identity,
        map_expr, reduce_expr, post_map_expr,
        type_map, preamble, options)
# Toolchain locations resolved once at import time from the environment.
cdef str _cub_path = _environment.get_cub_path()
cdef str _nvcc_path = _environment.get_nvcc_path()
cdef str _rocm_path = _environment.get_rocm_path()
cdef str _hipcc_path = _environment.get_hipcc_path()
# Lazily computed cache for _get_cub_header_include().
cdef str _cub_header = None
cdef str _get_cub_header_include():
    # Returns the #include preamble matching where the CUB headers come from
    # (CuPy's bundled copy, the CUDA toolkit, or ROCm). The result is cached
    # in the module-level _cub_header after the first call.
    global _cub_header
    if _cub_header is not None:
        return _cub_header

    assert _cub_path is not None
    if _cub_path == '<bundle>':
        _cub_header = '''
#include <cupy/cuda_workaround.h>
#include <cupy/cub/cub/block/block_reduce.cuh>
#include <cupy/cub/cub/block/block_load.cuh>
'''
    elif _cub_path == '<CUDA>':
        _cub_header = '''
#include <cub/block/block_reduce.cuh>
#include <cub/block/block_load.cuh>
'''
    elif _cub_path == '<ROCm>':
        # As of ROCm 3.5.0, the block headers cannot be included by themselves
        # (many macros left undefined), so we must use the master header.
        _cub_header = '''
#include <hipcub/hipcub.hpp>
'''
    return _cub_header
# make it cpdef'd for unit tests
cpdef inline tuple _can_use_cub_block_reduction(
        list in_args, list out_args, tuple reduce_axis, tuple out_axis):
    '''
    If CUB BlockReduce can be used, this function returns a tuple of the
    needed parameters (axis permutation, contiguous size, and whether this is
    a full reduction), otherwise returns None.
    '''
    cdef tuple axis_permutes_cub
    cdef _ndarray_base in_arr
    cdef Py_ssize_t contiguous_size = 1
    cdef str order

    # detect whether CUB headers exists somewhere:
    if _cub_path is None:
        import warnings
        warnings.warn('CUB headers are not found.', RuntimeWarning)
        return None

    # we currently support reductions with 1 input and 1 output
    if len(in_args) != 1 or len(out_args) != 1:
        return None
    in_arr = in_args[0]

    # the axes might not be sorted when we arrive here...
    reduce_axis = tuple(sorted(reduce_axis))
    out_axis = tuple(sorted(out_axis))

    # check reduction axes, if not contiguous then fall back to old kernel
    if in_arr._f_contiguous:
        order = 'F'
        if not cub._cub_device_segmented_reduce_axis_compatible(
                reduce_axis, in_arr.ndim, order):
            return None
        # F-order: the reduced (leading) axes come first.
        axis_permutes_cub = reduce_axis + out_axis
    elif in_arr._c_contiguous:
        order = 'C'
        if not cub._cub_device_segmented_reduce_axis_compatible(
                reduce_axis, in_arr.ndim, order):
            return None
        # C-order: the reduced (trailing) axes come last.
        axis_permutes_cub = out_axis + reduce_axis
    else:
        return None
    # The combined permutation must cover all axes in natural order.
    if axis_permutes_cub != tuple(range(in_arr.ndim)):
        return None

    # full-reduction of N-D array: need to invoke the kernel twice
    cdef bint full_reduction = True if len(out_axis) == 0 else False

    # check if the number of elements is too large
    # (ref: cupy/cupy#3309 for CUB limit)
    for i in reduce_axis:
        contiguous_size *= in_arr.shape[i]
    if contiguous_size > 0x7fffffffffffffff or contiguous_size == 0:
        return None
    if full_reduction:
        # assume a GPU has at most 64 GB of physical memory
        if contiguous_size > 0x1000000000:
            return None
    else:
        # the number of blocks to be launched exceeds INT_MAX:
        if in_arr.size // contiguous_size > 0x7fffffff:
            return None

    # rare event (mainly for conda-forge users): nvcc is not found!
    if not runtime._is_hip_environment:
        if _nvcc_path is None:
            return None
    else:
        if _hipcc_path is None:
            return None
    return (axis_permutes_cub, contiguous_size, full_reduction)
# similar to cupy._core._kernel._get_kernel_params()
cdef str _get_cub_kernel_params(tuple params, tuple arginfos):
    # Renders the C parameter list for the generated __global__ kernel.
    cdef _kernel.ParameterInfo p
    cdef _kernel._ArgInfo arginfo
    cdef lst = []
    cdef str c_type, c_name
    cdef int i

    assert len(params) == len(arginfos)
    for i, (p, arginfo) in enumerate(zip(params, arginfos)):
        c_name = arginfo.get_c_var_name(p)
        if i < len(params) - 2:
            # Array arguments are passed as opaque (const) void pointers.
            c_type = 'const void*' if p.is_const else 'void*'
        else:
            # for segment size and array size
            c_type = arginfo.get_param_c_type(p)
        lst.append('{} {}'.format(c_type, c_name))
    return ', '.join(lst)
# Upper bound for the reduction block size: 256 on HIP/ROCm, 512 on CUDA.
cdef Py_ssize_t _cub_default_block_size = (
    256 if runtime._is_hip_environment else 512)
cdef (Py_ssize_t, Py_ssize_t) _get_cub_block_specs(  # NOQA
        Py_ssize_t contiguous_size):
    # This is recommended in the CUB internal and should be an
    # even number
    items_per_thread = 4

    # Calculate the reduction block dimensions.
    # Ideally, we want each block to handle one segment, so:
    # 1. block size < segment size: the block loops over the segment
    # 2. block size >= segment size: the segment fits in the block
    block_size = (contiguous_size + items_per_thread - 1) // items_per_thread
    # clp2: presumably rounds up to a power of two -- see
    # cupy._core.internal.clp2.
    block_size = internal.clp2(block_size)
    # Clamp to [warp size, default block size]; HIP wavefronts are 64 wide.
    warp_size = 32 if not runtime._is_hip_environment else 64
    if block_size < warp_size:
        block_size = warp_size
    elif block_size > _cub_default_block_size:
        block_size = _cub_default_block_size
    return items_per_thread, block_size
cdef _scalar.CScalar _cub_convert_to_c_scalar(
        Py_ssize_t segment_size, Py_ssize_t value):
    # Size arguments are passed as 32-bit scalars unless the segment size can
    # exceed INT_MAX, in which case the generic (wider) conversion is used.
    if segment_size > 0x7fffffff:
        return _scalar.scalar_to_c_scalar(value)
    else:
        return _scalar.CScalar.from_int32(value)
cdef inline void _cub_two_pass_launch(
        str name, Py_ssize_t block_size, Py_ssize_t segment_size,
        Py_ssize_t items_per_thread, str reduce_type, tuple params,
        list in_args, list out_args,
        str identity, str pre_map_expr, str reduce_expr, str post_map_expr,
        _kernel._TypeMap type_map, str preamble,
        tuple options, stream) except*:
    '''
    Notes:
      1. Two-pass reduction: the first pass distributes an even share over
         a number of blocks (with block_size threads), and the second pass
         does reduction over 1 block of threads
    '''
    cdef list out_args_2nd_pass = [out_args[0]]
    cdef Py_ssize_t contiguous_size, out_block_num
    cdef function.Function func
    cdef memory.MemoryPointer memptr
    cdef str post_map_expr1, post_map_expr2, f
    cdef list inout_args
    cdef tuple cub_params
    cdef size_t gridx, blockx
    cdef _ndarray_base in_arr

    # fair share
    contiguous_size = min(segment_size, block_size * items_per_thread)
    out_block_num = (segment_size + contiguous_size - 1) // contiguous_size
    assert out_block_num <= 0x7fffffff

    # Because we can't know sizeof(reduce_type) in advance, here we
    # conservatively assume it's 32 bytes and allocate a work area
    memptr = memory.alloc(out_block_num * 32)
    out_args[0] = memptr

    # ************************ 1st pass ************************
    name += '_pass1'
    inout_args = [in_args[0], out_args[0],
                  _cub_convert_to_c_scalar(segment_size, contiguous_size),
                  _cub_convert_to_c_scalar(segment_size, segment_size)]
    cub_params = (items_per_thread,)

    # For mean(): the first pass must not divide by the element count; that
    # happens in the second pass (see post_map_expr2 below).
    if 'mean' in name:
        post_map_expr1 = post_map_expr.replace('_in_ind.size()', '1.0')
        post_map_expr1 = post_map_expr1.replace('_out_ind.size()', '1.0')
    elif any((f in name for f in ('argmax', 'argmin'))):
        # Workaround: in NumPy the indices are always generated based on
        # a C-order array (since PyArray_ContiguousFromAny was called).
        # We have to do a conversion here (?) since we do not retain the
        # info on strides.
        # TODO(leofang): improve this workaround
        in_arr = in_args[0]
        if in_arr.ndim > 1 and in_arr._f_contiguous:
            in_arr = _internal_ascontiguousarray(in_arr)
            inout_args[0] = in_args[0] = in_arr
        post_map_expr1 = post_map_expr
    else:
        post_map_expr1 = post_map_expr

    # Retrieve the kernel function
    func = _SimpleCubReductionKernel_get_cached_function(
        pre_map_expr, reduce_expr, post_map_expr1, reduce_type,
        params,
        _kernel._get_arginfos(inout_args),
        type_map,
        name, block_size, identity, preamble,
        ('-DFIRST_PASS=1',), cub_params)
    # Kernel arguments passed to the __global__ function.
    gridx = <size_t>(out_block_num * block_size)
    blockx = <size_t>block_size
    # Launch the kernel
    func.linear_launch(gridx, inout_args, 0, blockx, stream)

    # ************************ 2nd pass ************************
    # The per-block partial results become the input; one block reduces them.
    name = name[:-1] + '2'
    contiguous_size = out_block_num
    out_block_num = 1
    in_args = out_args
    out_args = out_args_2nd_pass
    inout_args = [in_args[0], out_args[0],
                  _cub_convert_to_c_scalar(segment_size, contiguous_size),
                  _cub_convert_to_c_scalar(segment_size, segment_size)]

    # For mean()
    if 'mean' in name:
        post_map_expr2 = post_map_expr.replace('_in_ind.size()',
                                               '_array_size')
        post_map_expr2 = post_map_expr2.replace('_out_ind.size()', '1.0')
    else:
        post_map_expr2 = post_map_expr

    # Retrieve the kernel function
    func = _SimpleCubReductionKernel_get_cached_function(
        'in0', reduce_expr, post_map_expr2, reduce_type,
        params,
        _kernel._get_arginfos(inout_args),
        type_map,
        name, block_size, identity, preamble,
        ('-DSECOND_PASS=1',), cub_params)
    # Kernel arguments passed to the __global__ function.
    gridx = <size_t>(out_block_num * block_size)
    blockx = <size_t>block_size
    # Launch the kernel
    func.linear_launch(gridx, inout_args, 0, blockx, stream)
    cdef inline void _launch_cub(
            self, out_block_num, block_size, block_stride,
            in_args, out_args, in_shape, out_shape, type_map,
            map_expr, reduce_expr, post_map_expr, reduce_type,
            stream, params, cub_params) except *:
        """Launch the CUB-based reduction kernel.

        For a full reduction, delegates to the two-pass launcher
        (`_cub_two_pass_launch`) and returns.  Otherwise performs a
        single-pass segmented reduction: builds the in/out argument list,
        fetches (or compiles) the cached kernel and launches it.

        `cub_params` is the tuple
        ``(items_per_thread, contiguous_size, full_reduction)``.
        """
        cdef bint full_reduction
        cdef Py_ssize_t contiguous_size, items_per_thread
        cdef function.Function func
        # Kernel arguments passed to the __global__ function.
        items_per_thread = cub_params[0]
        contiguous_size = cub_params[1]
        full_reduction = cub_params[2]
        if full_reduction:
            _cub_two_pass_launch(
                self.name, block_size, contiguous_size, items_per_thread,
                reduce_type, params, in_args, out_args, self.identity,
                map_expr, reduce_expr, post_map_expr,
                type_map, self.preamble, (), stream)
            return
        else:
            # Single pass: append the segment size twice; the second scalar
            # is 0 here (contrast with the two-pass launcher, which passes
            # the real segment size) -- presumably consumed as the
            # "_array_size" parameter by the kernel; confirm against the
            # kernel source.
            inout_args = (
                in_args + out_args +
                [_cub_convert_to_c_scalar(
                    contiguous_size, contiguous_size),
                 _cub_convert_to_c_scalar(
                    contiguous_size, 0)])
            arginfos = _kernel._get_arginfos(inout_args)
            func = _SimpleCubReductionKernel_get_cached_function(
                map_expr, reduce_expr, post_map_expr, reduce_type,
                params, arginfos, type_map,
                self.name, block_size, self.identity, self.preamble,
                (), cub_params)
            # One thread block per output segment.
            func.linear_launch(
                out_block_num * block_size, inout_args, 0, block_size, stream)
def _get_cub_optimized_params(
self, optimize_config, in_args, out_args, in_shape, out_shape,
type_map, map_expr, reduce_expr, post_map_expr, reduce_type,
stream, full_reduction, out_block_num, contiguous_size, params):
in_args = [_reduction._optimizer_copy_arg(a) for a in in_args]
out_args = [_reduction._optimizer_copy_arg(a) for a in out_args]
items_per_thread, block_size = (
_get_cub_block_specs(contiguous_size))
default_block_size_log = math.floor(math.log2(block_size))
default_items_per_thread = items_per_thread
def target_func(block_size, items_per_thread):
block_stride = block_size * items_per_thread
cub_params = (
items_per_thread, contiguous_size, full_reduction)
_launch_cub(
self,
out_block_num, block_size, block_stride, in_args, out_args,
in_shape, out_shape, type_map, map_expr, reduce_expr,
post_map_expr, reduce_type, stream, params, cub_params)
def suggest_func(trial):
block_size_log = trial.suggest_int('block_size_log', 5, 10)
block_size = 2 ** block_size_log
items_per_thread = trial.suggest_int(
'items_per_thread', 2, 32, step=2)
trial.set_user_attr('block_size', block_size)
return block_size, items_per_thread
# CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES is a possible error
optimize_impl = optimize_config.optimize_impl
best = optimize_impl(
optimize_config, target_func, suggest_func,
default_best={
'block_size_log': default_block_size_log,
'items_per_thread': default_items_per_thread,
}, ignore_error=(driver.CUDADriverError,))
return best.params['items_per_thread'], best.user_attrs['block_size']
    cdef bint _try_to_call_cub_reduction(
            self, list in_args, list out_args, const shape_t& a_shape,
            stream, optimize_context, tuple key,
            map_expr, reduce_expr, post_map_expr,
            reduce_type, _kernel._TypeMap type_map,
            tuple reduce_axis, tuple out_axis, const shape_t& out_shape,
            _ndarray_base ret) except *:
        """Try to use cub.

        Updates `ret` and returns a boolean value whether cub is used.

        Note: input_expr and output_expr are not used in CUB kernels.
        """
        cdef tuple axis_permutes
        cdef tuple params, opt_params
        cdef shape_t in_shape
        cdef Py_ssize_t i
        cdef Py_ssize_t contiguous_size = -1
        cdef Py_ssize_t block_size, block_stride, out_block_num = 0

        # decide to use CUB or not
        can_use_cub = _can_use_cub_block_reduction(
            in_args, out_args, reduce_axis, out_axis)
        if can_use_cub is None:
            return False
        axis_permutes, contiguous_size, full_reduction = can_use_cub

        # Permute the input axes so the reduced axes become contiguous.
        in_shape = _reduction._set_permuted_args(
            in_args, axis_permutes, a_shape, self.in_params)
        if in_args[0]._f_contiguous:
            # Match the output layout to the F-contiguous input.
            ret._set_contiguous_strides(ret.dtype.itemsize, False)
            out_args[0] = ret
        if not full_reduction:  # just need one pass
            out_block_num = 1  # = number of segments
            for i in out_axis:
                out_block_num *= in_shape[i]
            if 'mean' in self.name:
                # For mean: divide by the segment length, not the full
                # index sizes used by the generic reduction template.
                post_map_expr = post_map_expr.replace(
                    '_in_ind.size()', '_segment_size')
                post_map_expr = post_map_expr.replace(
                    '_out_ind.size()', '1.0')

        # Pick an index type wide enough for the segment size.
        if contiguous_size > 0x7fffffff:  # INT_MAX
            size_type = 'uint64'
        else:
            size_type = 'int32'
        type_map = _kernel._TypeMap(type_map._pairs + (('sizeT', size_type),))
        params = (self._params[0:2]
                  + _get_param_info(size_type + ' _segment_size', True)
                  + _get_param_info(size_type + ' _array_size', True))

        # HACK for ReductionKernel:
        # 1. input/output arguments might not be named as in0/out0
        # 2. pre-/post- maps might not contain in0/out0
        # 3. type_map does not contain the expected names (type_in0_raw and
        #    type_out0_raw)
        cdef str old_in0 = params[0].name, old_out0 = params[1].name
        if old_in0 != 'in0' or old_out0 != 'out0':
            # avoid overwriting self's attributes
            params = (_get_param_info('T in0', True)
                      + _get_param_info('T out0', False)
                      + params[2:])
            map_expr = map_expr.replace(old_in0, 'in0')
            post_map_expr = post_map_expr.replace(old_out0, 'out0')
            type_map = _kernel._TypeMap(type_map._pairs + (
                ('type_in0_raw', in_args[0].dtype.type),
                ('type_out0_raw', out_args[0].dtype.type),
            ))

        # Calculate the reduction block dimensions.
        optimize_context = _optimize_config.get_current_context()
        if optimize_context is None:
            # Calculate manually
            items_per_thread, block_size = _get_cub_block_specs(
                contiguous_size)
        else:
            # Optimize dynamically
            key = ('cub_reduction',) + key
            opt_params = optimize_context.get_params(key)
            if opt_params is None:
                opt_params = _get_cub_optimized_params(
                    self,
                    optimize_context.config, in_args, out_args,
                    in_shape, out_shape, type_map, map_expr, reduce_expr,
                    post_map_expr, reduce_type, stream,
                    full_reduction, out_block_num, contiguous_size, params)
                # Cache the tuned parameters for subsequent calls.
                optimize_context.set_params(key, opt_params)
            items_per_thread, block_size = opt_params

        block_stride = block_size * items_per_thread
        cub_params = (items_per_thread, contiguous_size, full_reduction)
        _launch_cub(
            self,
            out_block_num,
            block_size,
            block_stride,
            in_args, out_args,
            in_shape, out_shape,
            type_map,
            map_expr, reduce_expr, post_map_expr, reduce_type,
            stream, params, cub_params)
        return True
# Forward (.pxd-style) declarations of the dtype helpers so that other
# Cython modules can cimport them.
cpdef get_dtype(t)
cpdef tuple get_dtype_with_itemsize(t)
cpdef int to_cuda_dtype(dtype, bint is_half_allowed=*) except -1
cpdef void _raise_if_invalid_cast(
    from_dt,
    to_dt,
    str casting,
    argname=*
) except *
cimport cython # NOQA
import numpy
import warnings
from cupy_backends.cuda.api cimport runtime
# NumPy typecode characters for every dtype supported here.
all_type_chars = '?bhilqBHILQefdFD'
# for c in '?bhilqBHILQefdFD':
#     print('#', c, '...', np.dtype(c).name)
# ? ... bool
# b ... int8
# h ... int16
# i ... int32
# l ... int64 (int32 in windows)
# q ... int64
# B ... uint8
# H ... uint16
# I ... uint32
# L ... uint64 (uint32 in windows)
# Q ... uint64
# e ... float16
# f ... float32
# d ... float64
# F ... complex64
# D ... complex128
# Memoization table: key -> (numpy.dtype, itemsize).
cdef dict _dtype_dict = {}
cdef _dtype = numpy.dtype


cdef _init_dtype_dict():
    """Populate `_dtype_dict` with every key form we want to accept."""
    # Python scalar types (and None) resolve to NumPy's default dtypes.
    for key in (int, float, bool, complex, None):
        dt = _dtype(key)
        _dtype_dict[key] = (dt, dt.itemsize)
    # Each typecode char also registers its NumPy scalar type.
    for char in all_type_chars:
        dt = _dtype(char)
        entry = (dt, dt.itemsize)
        _dtype_dict[char] = entry
        _dtype_dict[dt.type] = entry
    # Also accept canonical dtype names such as 'float64'.
    for name in {str(_dtype(c)) for c in all_type_chars}:
        dt = _dtype(name)
        _dtype_dict[name] = (dt, dt.itemsize)


_init_dtype_dict()
@cython.profile(False)
cpdef get_dtype(t):
    """Return the ``numpy.dtype`` for ``t``, using the memoized table."""
    cached = _dtype_dict.get(t, None)
    if cached is not None:
        return cached[0]
    # Key not pre-registered: let NumPy resolve it directly.
    return _dtype(t)
@cython.profile(False)
cpdef tuple get_dtype_with_itemsize(t):
    """Return ``(dtype, itemsize)`` for ``t``, using the memoized table."""
    cached = _dtype_dict.get(t, None)
    if cached is not None:
        return cached
    # Fall back to NumPy for keys outside the precomputed table.
    dt = _dtype(t)
    return dt, dt.itemsize
cpdef int to_cuda_dtype(dtype, bint is_half_allowed=False) except -1:
    """Map a NumPy dtype (or its typecode char) to the cudaDataType enum.

    Half-precision types are only accepted when ``is_half_allowed`` is
    True.  Raises TypeError for any unsupported dtype.
    """
    cdef str dtype_char
    try:
        dtype_char = dtype.char
    except AttributeError:
        # The caller passed the typecode character itself.
        dtype_char = dtype
    if is_half_allowed:
        if dtype_char == 'e':
            return runtime.CUDA_R_16F
        if dtype_char == 'E':
            # complex32, not supported in NumPy
            return runtime.CUDA_C_16F
    if dtype_char == 'f':
        return runtime.CUDA_R_32F
    if dtype_char == 'd':
        return runtime.CUDA_R_64F
    if dtype_char == 'F':
        return runtime.CUDA_C_32F
    if dtype_char == 'D':
        return runtime.CUDA_C_64F
    raise TypeError('dtype is not supported: {}'.format(dtype))
# Cache NumPy's cast-checking function in a module-level C global for
# fast repeated lookups.
cdef _numpy_can_cast = numpy.can_cast


cpdef void _raise_if_invalid_cast(
    from_dt, to_dt, str casting, argname="array data"
) except *:
    """Raise an error if a cast is not valid. Also checks whether the cast
    goes from complex to real and warns if it does.

    The error raised can be customized by giving `argname`. May pass a
    (lambda) function to avoid string construction on success.

    This function exists mainly to build a similar error everywhere.
    """
    # Identical dtype objects never need a cast.
    if from_dt is to_dt:
        return
    to_dt = get_dtype(to_dt)  # may still be a type not a dtype instance
    if casting == "same_kind" and from_dt.kind == to_dt.kind:
        # same-kind is the most common casting used and for NumPy dtypes.
        return
    if _numpy_can_cast(from_dt, to_dt, casting):
        if casting == "unsafe" and from_dt.kind == "c" and to_dt.kind in "iuf":
            # Complex warning, we are dropping the imaginary part:
            warnings.warn(
                'Casting complex values to real discards the imaginary part',
                numpy.ComplexWarning)
        return
    # Casting is not possible, raise the error
    if not isinstance(argname, str):
        # argname may be a callable that lazily builds the name string.
        argname = argname()
    raise TypeError(
        f'Cannot cast {argname} from {from_dt!r} to {to_dt!r} '
        f'according to the rule \'{casting}\'')
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment