devicearray.py

"""
A CUDA ND Array is recognized by checking the __cuda_memory__ attribute
on the object.  If it exists and evaluate to True, it must define shape,
strides, dtype and size attributes similar to a NumPy ndarray.
"""

import math
import functools
import operator
import copy
from ctypes import c_void_p

import numpy as np

import numba
from numba import _devicearray
from numba.cuda.cudadrv import devices
from numba.cuda.cudadrv import driver as _driver
from numba.core import types, config
from numba.np.unsafe.ndarray import to_fixed_tuple
from numba.misc import dummyarray
from numba.np import numpy_support
from numba.cuda.api_util import prepare_shape_strides_dtype
from numba.core.errors import NumbaPerformanceWarning
from warnings import warn

try:
    lru_cache = getattr(functools, 'lru_cache')(None)
except AttributeError:
    # Python 3.1 or lower
    def lru_cache(func):
        return func


def is_cuda_ndarray(obj):
    "Check if an object is a CUDA ndarray"
    return getattr(obj, '__cuda_ndarray__', False)


def verify_cuda_ndarray_interface(obj):
    "Verify the CUDA ndarray interface for an obj"
    require_cuda_ndarray(obj)

    def requires_attr(attr, typ):
        if not hasattr(obj, attr):
            raise AttributeError(attr)
        if not isinstance(getattr(obj, attr), typ):
            raise AttributeError('%s must be of type %s' % (attr, typ))

    requires_attr('shape', tuple)
    requires_attr('strides', tuple)
    requires_attr('dtype', np.dtype)
    requires_attr('size', int)


def require_cuda_ndarray(obj):
    "Raises ValueError is is_cuda_ndarray(obj) evaluates False"
    if not is_cuda_ndarray(obj):
        raise ValueError('require an cuda ndarray object')


class DeviceNDArrayBase(_devicearray.DeviceArray):
    """A on GPU NDArray representation
    """
    __cuda_memory__ = True
    __cuda_ndarray__ = True     # There must be gpu_data attribute

    def __init__(self, shape, strides, dtype, stream=0, gpu_data=None):
        """
        Args
        ----

        shape
            array shape.
        strides
            array strides.
        dtype
            data type as np.dtype coercible object.
        stream
            cuda stream.
        gpu_data
            user provided device memory for the ndarray data buffer
        """
        if isinstance(shape, int):
            shape = (shape,)
        if isinstance(strides, int):
            strides = (strides,)
        dtype = np.dtype(dtype)
        self.ndim = len(shape)
        if len(strides) != self.ndim:
            raise ValueError('strides not match ndim')
        self._dummy = dummyarray.Array.from_desc(0, shape, strides,
                                                 dtype.itemsize)
        self.shape = tuple(shape)
        self.strides = tuple(strides)
        self.dtype = dtype
        self.size = int(functools.reduce(operator.mul, self.shape, 1))
        # prepare gpu memory
        if self.size > 0:
            if gpu_data is None:
                self.alloc_size = _driver.memory_size_from_info(
                    self.shape, self.strides, self.dtype.itemsize)
                gpu_data = devices.get_context().memalloc(self.alloc_size)
            else:
                self.alloc_size = _driver.device_memory_size(gpu_data)
        else:
            # Make NULL pointer for empty allocation
            if _driver.USE_NV_BINDING:
                null = _driver.binding.CUdeviceptr(0)
            else:
                null = c_void_p(0)
            gpu_data = _driver.MemoryPointer(context=devices.get_context(),
                                             pointer=null, size=0)
            self.alloc_size = 0

        self.gpu_data = gpu_data
        self.stream = stream

    @property
    def __cuda_array_interface__(self):
        if _driver.USE_NV_BINDING:
            if self.device_ctypes_pointer is not None:
                ptr = int(self.device_ctypes_pointer)
            else:
                ptr = 0
        else:
            if self.device_ctypes_pointer.value is not None:
                ptr = self.device_ctypes_pointer.value
            else:
                ptr = 0

        return {
            'shape': tuple(self.shape),
            'strides': None if is_contiguous(self) else tuple(self.strides),
            'data': (ptr, False),
            'typestr': self.dtype.str,
            'stream': int(self.stream) if self.stream != 0 else None,
            'version': 3,
        }

    def bind(self, stream=0):
        """Bind a CUDA stream to this object so that all subsequent operation
        on this array defaults to the given stream.
        """
        clone = copy.copy(self)
        clone.stream = stream
        return clone

    @property
    def T(self):
        return self.transpose()

    def transpose(self, axes=None):
        if axes and tuple(axes) == tuple(range(self.ndim)):
            return self
        elif self.ndim != 2:
            msg = "transposing a non-2D DeviceNDArray isn't supported"
            raise NotImplementedError(msg)
        elif axes is not None and set(axes) != set(range(self.ndim)):
            raise ValueError("invalid axes list %r" % (axes,))
        else:
            from numba.cuda.kernels.transpose import transpose
            return transpose(self)

    def _default_stream(self, stream):
        return self.stream if not stream else stream

    @property
    def _numba_type_(self):
        """
        Magic attribute expected by Numba to get the numba type that
        represents this object.
        """
        # Typing considerations:
        #
        # 1. The preference is to use 'C' or 'F' layout since this enables
        # hardcoding stride values into compiled kernels, which is more
        # efficient than storing a passed-in value in a register.
        #
        # 2. If an array is both C- and F-contiguous, prefer 'C' layout as it's
        # the more likely / common case.
        #
        # 3. If an array is broadcast then it must be typed as 'A' - using 'C'
        # or 'F' does not apply for broadcast arrays, because the strides, some
        # of which will be 0, will not match those hardcoded in for 'C' or 'F'
        # layouts.

        broadcast = 0 in self.strides
        if self.flags['C_CONTIGUOUS'] and not broadcast:
            layout = 'C'
        elif self.flags['F_CONTIGUOUS'] and not broadcast:
            layout = 'F'
        else:
            layout = 'A'

        dtype = numpy_support.from_dtype(self.dtype)
        return types.Array(dtype, self.ndim, layout)

    @property
    def device_ctypes_pointer(self):
        """Returns the ctypes pointer to the GPU data buffer
        """
        if self.gpu_data is None:
            if _driver.USE_NV_BINDING:
                return _driver.binding.CUdeviceptr(0)
            else:
                return c_void_p(0)
        else:
            return self.gpu_data.device_ctypes_pointer

    @devices.require_context
    def copy_to_device(self, ary, stream=0):
        """Copy `ary` to `self`.

        If `ary` is a CUDA memory, perform a device-to-device transfer.
        Otherwise, perform a a host-to-device transfer.
        """
        if ary.size == 0:
            # Nothing to do
            return

        sentry_contiguous(self)
        stream = self._default_stream(stream)

        self_core, ary_core = array_core(self), array_core(ary)
        if _driver.is_device_memory(ary):
            sentry_contiguous(ary)
            check_array_compatibility(self_core, ary_core)
            _driver.device_to_device(self, ary, self.alloc_size, stream=stream)
        else:
            # Ensure same contiguity. Only makes a host-side copy if necessary
            # (i.e., in order to materialize a writable strided view)
            ary_core = np.array(
                ary_core,
                order='C' if self_core.flags['C_CONTIGUOUS'] else 'F',
                subok=True,
                copy=not ary_core.flags['WRITEABLE'])
            check_array_compatibility(self_core, ary_core)
            _driver.host_to_device(self, ary_core, self.alloc_size,
                                   stream=stream)

    @devices.require_context
    def copy_to_host(self, ary=None, stream=0):
        """Copy ``self`` to ``ary`` or create a new Numpy ndarray
        if ``ary`` is ``None``.

        If a CUDA ``stream`` is given, then the transfer will be made
        asynchronously as part as the given stream.  Otherwise, the transfer is
        synchronous: the function returns after the copy is finished.

        Always returns the host array.

        Example::

            import numpy as np
            from numba import cuda

            arr = np.arange(1000)
            d_arr = cuda.to_device(arr)

            my_kernel[100, 100](d_arr)

            result_array = d_arr.copy_to_host()
        """
        if any(s < 0 for s in self.strides):
            msg = 'D->H copy not implemented for negative strides: {}'
            raise NotImplementedError(msg.format(self.strides))
        assert self.alloc_size >= 0, "Negative memory size"
        stream = self._default_stream(stream)
        if ary is None:
            hostary = np.empty(shape=self.alloc_size, dtype=np.byte)
        else:
            check_array_compatibility(self, ary)
            hostary = ary

        if self.alloc_size != 0:
            _driver.device_to_host(hostary, self, self.alloc_size,
                                   stream=stream)

        if ary is None:
            if self.size == 0:
                hostary = np.ndarray(shape=self.shape, dtype=self.dtype,
                                     buffer=hostary)
            else:
                hostary = np.ndarray(shape=self.shape, dtype=self.dtype,
                                     strides=self.strides, buffer=hostary)
        return hostary

    def split(self, section, stream=0):
        """Split the array into equal partition of the `section` size.
        If the array cannot be equally divided, the last section will be
        smaller.
        """
        stream = self._default_stream(stream)
        if self.ndim != 1:
            raise ValueError("only support 1d array")
        if self.strides[0] != self.dtype.itemsize:
            raise ValueError("only support unit stride")
        nsect = int(math.ceil(float(self.size) / section))
        strides = self.strides
        itemsize = self.dtype.itemsize
        for i in range(nsect):
            begin = i * section
            end = min(begin + section, self.size)
            shape = (end - begin,)
            gpu_data = self.gpu_data.view(begin * itemsize, end * itemsize)
            yield DeviceNDArray(shape, strides, dtype=self.dtype, stream=stream,
                                gpu_data=gpu_data)

    def as_cuda_arg(self):
        """Returns a device memory object that is used as the argument.
        """
        return self.gpu_data

    def get_ipc_handle(self):
        """
        Returns a *IpcArrayHandle* object that is safe to serialize and transfer
        to another process to share the local allocation.

        Note: this feature is only available on Linux.
        """
        ipch = devices.get_context().get_ipc_handle(self.gpu_data)
        desc = dict(shape=self.shape, strides=self.strides, dtype=self.dtype)
        return IpcArrayHandle(ipc_handle=ipch, array_desc=desc)

    def squeeze(self, axis=None, stream=0):
        """
        Remove axes of size one from the array shape.

        Parameters
        ----------
        axis : None or int or tuple of ints, optional
            Subset of dimensions to remove. A `ValueError` is raised if an axis
            with size greater than one is selected. If `None`, all axes with
            size one are removed.
        stream : cuda stream or 0, optional
            Default stream for the returned view of the array.

        Returns
        -------
        DeviceNDArray
            Squeezed view into the array.

        """
        new_dummy, _ = self._dummy.squeeze(axis=axis)
        return DeviceNDArray(
            shape=new_dummy.shape,
            strides=new_dummy.strides,
            dtype=self.dtype,
            stream=self._default_stream(stream),
            gpu_data=self.gpu_data,
        )

    def view(self, dtype):
        """Returns a new object by reinterpretting the dtype without making a
        copy of the data.
        """
        dtype = np.dtype(dtype)
        shape = list(self.shape)
        strides = list(self.strides)

        if self.dtype.itemsize != dtype.itemsize:
            if not self.is_c_contiguous():
                raise ValueError(
                    "To change to a dtype of a different size,"
                    " the array must be C-contiguous"
                )

            shape[-1], rem = divmod(
                shape[-1] * self.dtype.itemsize,
                dtype.itemsize
            )

            if rem != 0:
                raise ValueError(
                    "When changing to a larger dtype,"
                    " its size must be a divisor of the total size in bytes"
                    " of the last axis of the array."
                )

            strides[-1] = dtype.itemsize

        return DeviceNDArray(
            shape=shape,
            strides=strides,
            dtype=dtype,
            stream=self.stream,
            gpu_data=self.gpu_data,
        )

    @property
    def nbytes(self):
        # Note: not using `alloc_size`.  `alloc_size` reports memory
        # consumption of the allocation, not the size of the array
        # https://docs.scipy.org/doc/numpy/reference/generated/numpy.ndarray.nbytes.html
        return self.dtype.itemsize * self.size


class DeviceRecord(DeviceNDArrayBase):
    '''
    An on-GPU record type
    '''
    def __init__(self, dtype, stream=0, gpu_data=None):
        shape = ()
        strides = ()
        super(DeviceRecord, self).__init__(shape, strides, dtype, stream,
                                           gpu_data)

    @property
    def flags(self):
        """
        For `numpy.ndarray` compatibility. Ideally this would return a
        `np.core.multiarray.flagsobj`, but that needs to be constructed
        with an existing `numpy.ndarray` (as the C- and F- contiguous flags
        aren't writeable).
        """
        return dict(self._dummy.flags) # defensive copy

    @property
    def _numba_type_(self):
        """
        Magic attribute expected by Numba to get the numba type that
        represents this object.
        """
        return numpy_support.from_dtype(self.dtype)

    @devices.require_context
    def __getitem__(self, item):
        return self._do_getitem(item)

    @devices.require_context
    def getitem(self, item, stream=0):
        """Do `__getitem__(item)` with CUDA stream
        """
        return self._do_getitem(item, stream)

    def _do_getitem(self, item, stream=0):
        stream = self._default_stream(stream)
        typ, offset = self.dtype.fields[item]
        newdata = self.gpu_data.view(offset)

        if typ.shape == ():
            if typ.names is not None:
                return DeviceRecord(dtype=typ, stream=stream,
                                    gpu_data=newdata)
            else:
                hostary = np.empty(1, dtype=typ)
                _driver.device_to_host(dst=hostary, src=newdata,
                                       size=typ.itemsize,
                                       stream=stream)
            return hostary[0]
        else:
            shape, strides, dtype = \
                prepare_shape_strides_dtype(typ.shape,
                                            None,
                                            typ.subdtype[0], 'C')
            return DeviceNDArray(shape=shape, strides=strides,
                                 dtype=dtype, gpu_data=newdata,
                                 stream=stream)

    @devices.require_context
    def __setitem__(self, key, value):
        return self._do_setitem(key, value)

    @devices.require_context
    def setitem(self, key, value, stream=0):
        """Do `__setitem__(key, value)` with CUDA stream
        """
        return self._do_setitem(key, value, stream=stream)

    def _do_setitem(self, key, value, stream=0):

        stream = self._default_stream(stream)

        # If the record didn't have a default stream, and the user didn't
        # provide a stream, then we will use the default stream for the
        # assignment kernel and synchronize on it.
        synchronous = not stream
        if synchronous:
            ctx = devices.get_context()
            stream = ctx.get_default_stream()

        # (1) prepare LHS

        typ, offset = self.dtype.fields[key]
        newdata = self.gpu_data.view(offset)

        lhs = type(self)(dtype=typ, stream=stream, gpu_data=newdata)

        # (2) prepare RHS

        rhs, _ = auto_device(lhs.dtype.type(value), stream=stream)

        # (3) do the copy

        _driver.device_to_device(lhs, rhs, rhs.dtype.itemsize, stream)

        if synchronous:
            stream.synchronize()


@lru_cache
def _assign_kernel(ndim):
    """
    A separate method so we don't need to compile code every assignment (!).

    :param ndim: We need to have static array sizes for cuda.local.array, so
        bake in the number of dimensions into the kernel
    """
    from numba import cuda  # circular!

    if ndim == 0:
        # the (2, ndim) allocation below is not yet supported, so avoid it
        @cuda.jit
        def kernel(lhs, rhs):
            lhs[()] = rhs[()]
        return kernel

    @cuda.jit
    def kernel(lhs, rhs):
        location = cuda.grid(1)

        n_elements = 1
        for i in range(lhs.ndim):
            n_elements *= lhs.shape[i]
        if location >= n_elements:
            # bake n_elements into the kernel, better than passing it in
            # as another argument.
            return

        # [0, :] is the to-index (into `lhs`)
        # [1, :] is the from-index (into `rhs`)
        idx = cuda.local.array(
            shape=(2, ndim),
            dtype=types.int64)

        for i in range(ndim - 1, -1, -1):
            idx[0, i] = location % lhs.shape[i]
            idx[1, i] = (location % lhs.shape[i]) * (rhs.shape[i] > 1)
            location //= lhs.shape[i]

        lhs[to_fixed_tuple(idx[0], ndim)] = rhs[to_fixed_tuple(idx[1], ndim)]
    return kernel


class DeviceNDArray(DeviceNDArrayBase):
    '''
    An on-GPU array type
    '''
    def is_f_contiguous(self):
        '''
        Return true if the array is Fortran-contiguous.
        '''
        return self._dummy.is_f_contig

    @property
    def flags(self):
        """
        For `numpy.ndarray` compatibility. Ideally this would return a
        `np.core.multiarray.flagsobj`, but that needs to be constructed
        with an existing `numpy.ndarray` (as the C- and F- contiguous flags
        aren't writeable).
        """
        return dict(self._dummy.flags) # defensive copy

    def is_c_contiguous(self):
        '''
        Return true if the array is C-contiguous.
        '''
        return self._dummy.is_c_contig

    def __array__(self, dtype=None):
        """
        :return: an `numpy.ndarray`, so copies to the host.
        """
        if dtype:
            return self.copy_to_host().__array__(dtype)
        else:
            return self.copy_to_host().__array__()

    def __len__(self):
        return self.shape[0]

    def reshape(self, *newshape, **kws):
        """
        Reshape the array without changing its contents, similarly to
        :meth:`numpy.ndarray.reshape`. Example::

            d_arr = d_arr.reshape(20, 50, order='F')
        """
        if len(newshape) == 1 and isinstance(newshape[0], (tuple, list)):
            newshape = newshape[0]

        cls = type(self)
        if newshape == self.shape:
            # nothing to do
            return cls(shape=self.shape, strides=self.strides,
                       dtype=self.dtype, gpu_data=self.gpu_data)

        newarr, extents = self._dummy.reshape(*newshape, **kws)

        if extents == [self._dummy.extent]:
            return cls(shape=newarr.shape, strides=newarr.strides,
                       dtype=self.dtype, gpu_data=self.gpu_data)
        else:
            raise NotImplementedError("operation requires copying")

    def ravel(self, order='C', stream=0):
        '''
        Flattens a contiguous array without changing its contents, similar to
        :meth:`numpy.ndarray.ravel`. If the array is not contiguous, raises an
        exception.
        '''
        stream = self._default_stream(stream)
        cls = type(self)
        newarr, extents = self._dummy.ravel(order=order)

        if extents == [self._dummy.extent]:
            return cls(shape=newarr.shape, strides=newarr.strides,
                       dtype=self.dtype, gpu_data=self.gpu_data,
                       stream=stream)

        else:
            raise NotImplementedError("operation requires copying")

    @devices.require_context
    def __getitem__(self, item):
        return self._do_getitem(item)

    @devices.require_context
    def getitem(self, item, stream=0):
        """Do `__getitem__(item)` with CUDA stream
        """
        return self._do_getitem(item, stream)

    def _do_getitem(self, item, stream=0):
        stream = self._default_stream(stream)

        arr = self._dummy.__getitem__(item)
        extents = list(arr.iter_contiguous_extent())
        cls = type(self)
        if len(extents) == 1:
            newdata = self.gpu_data.view(*extents[0])

            if not arr.is_array:
                # Check for structured array type (record)
                if self.dtype.names is not None:
                    return DeviceRecord(dtype=self.dtype, stream=stream,
                                        gpu_data=newdata)
                else:
                    # Element indexing
                    hostary = np.empty(1, dtype=self.dtype)
                    _driver.device_to_host(dst=hostary, src=newdata,
                                           size=self._dummy.itemsize,
                                           stream=stream)
                return hostary[0]
            else:
                return cls(shape=arr.shape, strides=arr.strides,
                           dtype=self.dtype, gpu_data=newdata, stream=stream)
        else:
            newdata = self.gpu_data.view(*arr.extent)
            return cls(shape=arr.shape, strides=arr.strides,
                       dtype=self.dtype, gpu_data=newdata, stream=stream)

    @devices.require_context
    def __setitem__(self, key, value):
        return self._do_setitem(key, value)

    @devices.require_context
    def setitem(self, key, value, stream=0):
        """Do `__setitem__(key, value)` with CUDA stream
        """
        return self._do_setitem(key, value, stream=stream)

    def _do_setitem(self, key, value, stream=0):

        stream = self._default_stream(stream)

        # If the array didn't have a default stream, and the user didn't provide
        # a stream, then we will use the default stream for the assignment
        # kernel and synchronize on it.
        synchronous = not stream
        if synchronous:
            ctx = devices.get_context()
            stream = ctx.get_default_stream()

        # (1) prepare LHS

        arr = self._dummy.__getitem__(key)
        newdata = self.gpu_data.view(*arr.extent)

        if isinstance(arr, dummyarray.Element):
            # convert to a 0d array
            shape = ()
            strides = ()
        else:
            shape = arr.shape
            strides = arr.strides

        lhs = type(self)(
            shape=shape,
            strides=strides,
            dtype=self.dtype,
            gpu_data=newdata,
            stream=stream)

        # (2) prepare RHS

        rhs, _ = auto_device(value, stream=stream, user_explicit=True)
        if rhs.ndim > lhs.ndim:
            raise ValueError("Can't assign %s-D array to %s-D self" % (
                rhs.ndim,
                lhs.ndim))
        rhs_shape = np.ones(lhs.ndim, dtype=np.int64)
        # negative indices would not work if rhs.ndim == 0
        rhs_shape[lhs.ndim - rhs.ndim:] = rhs.shape
        rhs = rhs.reshape(*rhs_shape)
        for i, (l, r) in enumerate(zip(lhs.shape, rhs.shape)):
            if r != 1 and l != r:
                raise ValueError("Can't copy sequence with size %d to array "
                                 "axis %d with dimension %d" % ( r, i, l))

        # (3) do the copy

        n_elements = functools.reduce(operator.mul, lhs.shape, 1)
        _assign_kernel(lhs.ndim).forall(n_elements, stream=stream)(lhs, rhs)
        if synchronous:
            stream.synchronize()


class IpcArrayHandle(object):
    """
    An IPC array handle that can be serialized and transfer to another process
    in the same machine for share a GPU allocation.

    On the destination process, use the *.open()* method to creates a new
    *DeviceNDArray* object that shares the allocation from the original process.
    To release the resources, call the *.close()* method.  After that, the
    destination can no longer use the shared array object.  (Note: the
    underlying weakref to the resource is now dead.)

    This object implements the context-manager interface that calls the
    *.open()* and *.close()* method automatically::

        with the_ipc_array_handle as ipc_array:
            # use ipc_array here as a normal gpu array object
            some_code(ipc_array)
        # ipc_array is dead at this point
    """
    def __init__(self, ipc_handle, array_desc):
        self._array_desc = array_desc
        self._ipc_handle = ipc_handle

    def open(self):
        """
        Returns a new *DeviceNDArray* that shares the allocation from the
        original process.  Must not be used on the original process.
        """
        dptr = self._ipc_handle.open(devices.get_context())
        return DeviceNDArray(gpu_data=dptr, **self._array_desc)

    def close(self):
        """
        Closes the IPC handle to the array.
        """
        self._ipc_handle.close()

    def __enter__(self):
        return self.open()

    def __exit__(self, type, value, traceback):
        self.close()


class MappedNDArray(DeviceNDArrayBase, np.ndarray):
    """
    A host array that uses CUDA mapped memory.
    """

    def device_setup(self, gpu_data, stream=0):
        self.gpu_data = gpu_data
        self.stream = stream


class ManagedNDArray(DeviceNDArrayBase, np.ndarray):
    """
    A host array that uses CUDA managed memory.
    """

    def device_setup(self, gpu_data, stream=0):
        self.gpu_data = gpu_data
        self.stream = stream


def from_array_like(ary, stream=0, gpu_data=None):
    "Create a DeviceNDArray object that is like ary."
    return DeviceNDArray(ary.shape, ary.strides, ary.dtype, stream=stream,
                         gpu_data=gpu_data)


def from_record_like(rec, stream=0, gpu_data=None):
    "Create a DeviceRecord object that is like rec."
    return DeviceRecord(rec.dtype, stream=stream, gpu_data=gpu_data)


def array_core(ary):
    """
    Extract the repeated core of a broadcast array.

    Broadcast arrays are by definition non-contiguous due to repeated
    dimensions, i.e., dimensions with stride 0. In order to ascertain memory
    contiguity and copy the underlying data from such arrays, we must create
    a view without the repeated dimensions.

    """
    if not ary.strides or not ary.size:
        return ary
    core_index = []
    for stride in ary.strides:
        core_index.append(0 if stride == 0 else slice(None))
    return ary[tuple(core_index)]


def is_contiguous(ary):
    """
    Returns True iff `ary` is C-style contiguous while ignoring
    broadcasted and 1-sized dimensions.
    As opposed to array_core(), it does not call require_context(),
    which can be quite expensive.
    """
    size = ary.dtype.itemsize
    for shape, stride in zip(reversed(ary.shape), reversed(ary.strides)):
        if shape > 1 and stride != 0:
            if size != stride:
                return False
            size *= shape
    return True


errmsg_contiguous_buffer = ("Array contains non-contiguous buffer and cannot "
                            "be transferred as a single memory region. Please "
                            "ensure contiguous buffer with numpy "
                            ".ascontiguousarray()")


def sentry_contiguous(ary):
    core = array_core(ary)
    if not core.flags['C_CONTIGUOUS'] and not core.flags['F_CONTIGUOUS']:
        raise ValueError(errmsg_contiguous_buffer)


def auto_device(obj, stream=0, copy=True, user_explicit=False):
    """
    Create a DeviceRecord or DeviceArray like obj and optionally copy data from
    host to device. If obj already represents device memory, it is returned and
    no copy is made.
    """
    if _driver.is_device_memory(obj):
        return obj, False
    elif hasattr(obj, '__cuda_array_interface__'):
        return numba.cuda.as_cuda_array(obj), False
    else:
        if isinstance(obj, np.void):
            devobj = from_record_like(obj, stream=stream)
        else:
            # This allows you to pass non-array objects like constants and
            # objects implementing the array interface
            # https://docs.scipy.org/doc/numpy-1.13.0/reference/arrays.interface.html
            # into this function (with no overhead -- copies -- for `obj`s
            # that are already `ndarray`s.
            obj = np.array(
                obj,
                copy=False,
                subok=True)
            sentry_contiguous(obj)
            devobj = from_array_like(obj, stream=stream)
        if copy:
            if config.CUDA_WARN_ON_IMPLICIT_COPY:
                if (
                    not user_explicit and
                    (not isinstance(obj, DeviceNDArray)
                     and isinstance(obj, np.ndarray))
                ):
                    msg = ("Host array used in CUDA kernel will incur "
                           "copy overhead to/from device.")
                    warn(NumbaPerformanceWarning(msg))
            devobj.copy_to_device(obj, stream=stream)
        return devobj, True


def check_array_compatibility(ary1, ary2):
    ary1sq, ary2sq = ary1.squeeze(), ary2.squeeze()
    if ary1.dtype != ary2.dtype:
        raise TypeError('incompatible dtype: %s vs. %s' %
                        (ary1.dtype, ary2.dtype))
    if ary1sq.shape != ary2sq.shape:
        raise ValueError('incompatible shape: %s vs. %s' %
                         (ary1.shape, ary2.shape))
    # We check strides only if the size is nonzero, because strides are
    # irrelevant (and can differ) for zero-length copies.
    if ary1.size and ary1sq.strides != ary2sq.strides:
        raise ValueError('incompatible strides: %s vs. %s' %
                         (ary1.strides, ary2.strides))