Commit 5c70ef66 authored by dugupeiwen's avatar dugupeiwen
Browse files

update roc

parent 1fb0017a
"""
A HSA dGPU backed ND Array is recognized by checking the __hsa_memory__
attribute on the object. If it exists and evaluate to True, it must define
shape, strides, dtype and size attributes similar to a NumPy ndarray.
"""
import warnings
import math
import copy
import weakref
from ctypes import c_void_p
import numpy as np
from numba.roc.hsadrv import driver as _driver
from numba.roc.hsadrv import devices
from numba.core import types
from .error import HsaContextMismatchError
from numba.misc import dummyarray
from numba.np import numpy_support
def is_hsa_ndarray(obj):
    "Check if an object is a HSA ndarray"
    return getattr(obj, '__hsa_ndarray__', False)


def verify_hsa_ndarray_interface(obj):
    "Verify the HSA ndarray interface for an obj"
    require_hsa_ndarray(obj)

    def _check(name, expected_type):
        # The attribute must both exist and carry the expected type.
        if not hasattr(obj, name):
            raise AttributeError(name)
        if not isinstance(getattr(obj, name), expected_type):
            raise AttributeError('%s must be of type %s'
                                 % (name, expected_type))

    _check('shape', tuple)
    _check('strides', tuple)
    _check('dtype', np.dtype)
    _check('size', int)


def require_hsa_ndarray(obj):
    "Raises ValueError if is_hsa_ndarray(obj) evaluates False"
    if is_hsa_ndarray(obj):
        return
    raise ValueError('require an hsa ndarray object')
class DeviceNDArrayBase(object):
    """Base class for an on dGPU NDArray representation cf. numpy.ndarray
    """
    __hsa_memory__ = True   # marks instances as HSA-addressable memory
    __hsa_ndarray__ = True  # There must be dgpu_data attribute as a result

    def __init__(self, shape, strides, dtype, dgpu_data=None):
        """
        Args
        ----
        shape
            array shape.
        strides
            array strides.
        dtype
            data type as numpy.dtype.
        dgpu_data
            user provided device memory for the ndarray data buffer

        Raises
        ------
        ValueError
            If ``strides`` does not match ``shape`` in length, or if a
            supplied ``dgpu_data`` lacks a ``_hsa_memsize_`` attribute.
        """
        # Accept bare ints as a 1-D convenience.
        if isinstance(shape, int):
            shape = (shape,)
        if isinstance(strides, int):
            strides = (strides,)
        self.ndim = len(shape)
        if len(strides) != self.ndim:
            raise ValueError('strides not match ndim')
        # Layout bookkeeping helper (contiguity, extents) — holds no storage.
        self._dummy = dummyarray.Array.from_desc(0, shape, strides,
                                                 dtype.itemsize)
        self.shape = tuple(shape)
        self.strides = tuple(strides)
        self.dtype = np.dtype(dtype)
        self.size = int(np.prod(self.shape))
        # prepare dgpu memory
        if self.size > 0:
            if dgpu_data is None:
                # Deferred import avoids a circular import at module load.
                from numba.roc.api import _memory_size_from_info
                self.alloc_size = _memory_size_from_info(self.shape,
                                                         self.strides,
                                                         self.dtype.itemsize)
                # find a coarse region on the dGPU
                dgpu_data = devices.get_context().mempoolalloc(self.alloc_size)
            else:  # we have some preallocated dgpu_memory
                sz = getattr(dgpu_data, '_hsa_memsize_', None)
                if sz is None:
                    # Fixed message: previously read "as no" instead of
                    # "has no".
                    raise ValueError('dgpu_data has no _hsa_memsize_ '
                                     'attribute')
                assert sz >= 0
                self.alloc_size = sz
        else:
            # Zero-sized array: no device allocation is required.
            dgpu_data = None
            self.alloc_size = 0
        self.dgpu_data = dgpu_data

    @property
    def _context(self):
        # Context of the device memory backing this array.
        return self.dgpu_data.context

    @property
    def _numba_type_(self):
        """
        Magic attribute expected by Numba to get the numba type that
        represents this object.
        """
        dtype = numpy_support.from_dtype(self.dtype)
        return types.Array(dtype, self.ndim, 'A')

    @property
    def device_ctypes_pointer(self):
        """Returns the ctypes pointer to the GPU data buffer
        """
        if self.dgpu_data is None:
            # NULL pointer for unallocated (zero-sized) arrays.
            return c_void_p(0)
        else:
            return self.dgpu_data.device_ctypes_pointer

    def copy_to_device(self, ary, stream=None, context=None):
        """Copy `ary` to `self`.

        If `ary` is a HSA memory, perform a device-to-device transfer.
        Otherwise, perform a host-to-device transfer.
        If `stream` is a stream object, an async copy is used.
        """
        if ary.size == 0:
            # Nothing to do
            return

        if context is not None:
            if self.dgpu_data is not None:
                # Guard against copying across mismatched device contexts.
                expect, got = self._context, context
                if expect.unproxy != got.unproxy:
                    raise HsaContextMismatchError(expect=expect, got=got)
        else:
            context = self._context

        # TODO: Worry about multiple dGPUs
        #if _driver.is_device_memory(ary):
        #    sz = min(self.alloc_size, ary.alloc_size)
        #    _driver.device_to_device(self, ary, sz)
        #else:
        #    sz = min(_driver.host_memory_size(ary), self.alloc_size)
        sz = self.alloc_size

        # host_to_dGPU(context, dst, src, size):
        if stream is None:
            # Synchronous path: flush pending work before the blocking copy.
            _driver.hsa.implicit_sync()
            if isinstance(ary, DeviceNDArray):
                _driver.dGPU_to_dGPU(self._context, self, ary, sz)
            else:
                _driver.host_to_dGPU(self._context, self, ary, sz)
        else:
            # Asynchronous path: the copy is enqueued on `stream`.
            if isinstance(ary, DeviceNDArray):
                _driver.async_dGPU_to_dGPU(dst_ctx=self._context,
                                           src_ctx=ary._context,
                                           dst=self, src=ary, size=sz,
                                           stream=stream)
            else:
                _driver.async_host_to_dGPU(dst_ctx=self._context,
                                           src_ctx=devices.get_cpu_context(),
                                           dst=self, src=ary, size=sz,
                                           stream=stream)

    def copy_to_host(self, ary=None, stream=None):
        """Copy ``self`` to ``ary`` or create a new Numpy ndarray
        if ``ary`` is ``None``.

        The transfer is synchronous: the function returns after the copy
        is finished.

        Always returns the host array.

        Example::

            import numpy as np
            from numba import hsa

            arr = np.arange(1000)
            d_arr = hsa.to_device(arr)

            my_kernel[100, 100](d_arr)

            result_array = d_arr.copy_to_host()
        """
        if ary is None:  # destination does not exist
            # Raw byte buffer; it is reinterpreted with the proper dtype
            # and strides after the transfer below.
            hostary = np.empty(shape=self.alloc_size, dtype=np.byte)
        else:  # destination does exist, it's `ary`, check it
            if ary.dtype != self.dtype:
                raise TypeError('incompatible dtype')

            if ary.shape != self.shape:
                # Permit () <-> (1,) scalar-shape mismatches only.
                scalshapes = (), (1,)
                if not (ary.shape in scalshapes and self.shape in scalshapes):
                    raise TypeError('incompatible shape; device %s; host %s' %
                                    (self.shape, ary.shape))
            if ary.strides != self.strides:
                # Matching scalar leniency for strides.
                scalstrides = (), (self.dtype.itemsize,)
                if not (ary.strides in scalstrides and
                        self.strides in scalstrides):
                    raise TypeError('incompatible strides; device %s; host %s'
                                    % (self.strides, ary.strides))
            hostary = ary  # this is supposed to be a ptr for writing

        # a location for the data exists as `hostary`
        assert self.alloc_size >= 0, "Negative memory size"
        context = self._context

        # copy the data from the device to the hostary
        if self.alloc_size != 0:
            sz = self.alloc_size
            if stream is None:
                _driver.hsa.implicit_sync()
                _driver.dGPU_to_host(context, hostary, self, sz)
            else:
                _driver.async_dGPU_to_host(dst_ctx=devices.get_cpu_context(),
                                           src_ctx=self._context,
                                           dst=hostary, src=self,
                                           size=sz, stream=stream)

        # if the location for the data was originally None
        # then create a new ndarray and plumb in the new memory
        if ary is None:
            if self.size == 0:
                hostary = np.ndarray(shape=self.shape, dtype=self.dtype,
                                     buffer=hostary)
            else:
                hostary = np.ndarray(shape=self.shape, dtype=self.dtype,
                                     strides=self.strides, buffer=hostary)
        else:  # else hostary points to ary and now has the right memory
            hostary = ary
        return hostary

    def as_hsa_arg(self):
        """Returns a device memory object that is used as the argument.
        """
        return self.dgpu_data
class DeviceNDArray(DeviceNDArrayBase):
    '''
    An on-dGPU array type
    '''
    def is_f_contiguous(self):
        '''
        Return true if the array is Fortran-contiguous.
        '''
        return self._dummy.is_f_contig

    def is_c_contiguous(self):
        '''
        Return true if the array is C-contiguous.
        '''
        return self._dummy.is_c_contig

    def reshape(self, *newshape, **kws):
        """
        Reshape the array without changing its contents, similarly to
        :meth:`numpy.ndarray.reshape`. Example::

            d_arr = d_arr.reshape(20, 50, order='F')
        """
        if len(newshape) == 1 and isinstance(newshape[0], (tuple, list)):
            newshape = newshape[0]

        cls = type(self)
        if newshape == self.shape:
            # Same shape requested: alias the existing device buffer.
            return cls(shape=self.shape, strides=self.strides,
                       dtype=self.dtype, dgpu_data=self.dgpu_data)

        newarr, extents = self._dummy.reshape(*newshape, **kws)
        if extents != [self._dummy.extent]:
            raise NotImplementedError("operation requires copying")
        return cls(shape=newarr.shape, strides=newarr.strides,
                   dtype=self.dtype, dgpu_data=self.dgpu_data)

    def ravel(self, order='C'):
        '''
        Flatten the array without changing its contents, similar to
        :meth:`numpy.ndarray.ravel`.
        '''
        newarr, extents = self._dummy.ravel(order=order)
        if extents != [self._dummy.extent]:
            raise NotImplementedError("operation requires copying")
        cls = type(self)
        return cls(shape=newarr.shape, strides=newarr.strides,
                   dtype=self.dtype, dgpu_data=self.dgpu_data)
class HostArray(np.ndarray):
    """ndarray subclass flagged as HSA memory (``__hsa_memory__``) so host
    buffers can be passed where device memory objects are expected."""
    __hsa_memory__ = True

    @property
    def device_ctypes_pointer(self):
        # The host buffer address, exposed as a ctypes void pointer.
        pointer = self.ctypes.data_as(c_void_p)
        return pointer
def from_array_like(ary, dgpu_data=None):
    "Create a DeviceNDArray object that is like ary."
    # Promote 0-d (scalar) arrays to 1-d, length-1 arrays.
    src = ary.reshape(1) if ary.ndim == 0 else ary
    return DeviceNDArray(src.shape, src.strides, src.dtype,
                         dgpu_data=dgpu_data)
errmsg_contiguous_buffer = ("Array contains non-contiguous buffer and cannot "
"be transferred as a single memory region. Please "
"ensure contiguous buffer with numpy "
".ascontiguousarray()")
def _single_buffer(ary):
i = np.argmax(ary.strides)
size = ary.strides[i] * ary.shape[i]
return size == ary.nbytes
def sentry_contiguous(ary):
    """Raise ValueError unless *ary* can be transferred as one memory region.

    Contiguous (C or F) arrays pass silently; arrays broadcast along the
    leading axis are checked on their inner view; otherwise the array must
    occupy a single buffer.
    """
    flags = ary.flags
    if flags['C_CONTIGUOUS'] or flags['F_CONTIGUOUS']:
        return
    if ary.strides[0] == 0:
        # Broadcasted, ensure inner contiguous
        return sentry_contiguous(ary[0])
    if _single_buffer(ary):
        return True
    raise ValueError(errmsg_contiguous_buffer)
def auto_device(obj, context, stream=None, copy=True):
    """
    Create a DeviceArray like obj and optionally copy data from
    host to device. If obj already represents device memory, it is returned and
    no copy is made.
    """
    if _driver.is_device_memory(obj):
        # Already resident on the dGPU; hand it back untouched.
        return obj, False

    # Host data: verify it is transferable, then mirror it on the device.
    sentry_contiguous(obj)
    devobj = from_array_like(obj)
    if copy:
        devobj.copy_to_device(obj, stream=stream, context=context)
    return devobj, True
"""
Expose each GPU device directly
"""
import functools
from .driver import hsa as driver, Context as _Context
from numba.roc import servicelib
class _culist(object):
"""A thread local list of GPU instances
"""
def __init__(self):
self._lst = None
@property
def _gpus(self):
if not self._lst:
self._lst = self._init_gpus()
return self._lst
def _init_gpus(self):
gpus = []
for com in driver.components:
gpus.append(CU(com))
return gpus
def __getitem__(self, item):
return self._gpus[item]
def append(self, item):
return self._gpus.append(item)
def __len__(self):
return len(self._gpus)
def __nonzero__(self):
return bool(self._gpus)
def __iter__(self):
return iter(self._gpus)
__bool__ = __nonzero__
def reset(self):
for gpu in self:
gpu.reset()
@property
def current(self):
"""Get the current GPU object associated with the thread
"""
return _custack.top
# Module-level singleton holding the thread-local GPU list; the class is
# deleted immediately so no second instance can be created.
cus = _culist()
del _culist
class CU(object):
    """Proxy for a compute unit (GPU) that lazily owns an HSA context and
    can be used as a context manager to make itself the current device."""

    def __init__(self, cu):
        self._cu = cu
        self._context = None

    def __getattr__(self, key):
        """Redirect to self._gpu
        """
        # Private names are never proxied to the wrapped handle.
        if not key.startswith('_'):
            return getattr(self._cu, key)
        raise AttributeError(key)

    def __repr__(self):
        return repr(self._cu)

    def associate_context(self):
        """Associate the context of this GPU to the running thread
        """
        if self._context is None:
            # No context was created for this GPU yet; make one now.
            self._context = self._cu.create_context()
        return self._context

    def __enter__(self):
        self.associate_context()
        _custack.push(self)

    def __exit__(self, exc_type, exc_val, exc_tb):
        assert _get_device() is self
        self._context.pop()
        _custack.pop()

    def reset(self):
        if not self._context:
            return
        self._context.reset()
        self._context = None
# Lazily created context for the host CPU agent (singleton).
_cpu_context = None


def get_cpu_context():
    """Return the process-wide CPU context, creating it on first use."""
    global _cpu_context
    if _cpu_context is None:
        # The CPU is the agent that is not a component (i.e. not a GPU).
        cpu_agents = [agent for agent in driver.agents
                      if not agent.is_component]
        _cpu_context = _Context(cpu_agents[0])
    return _cpu_context
def get_gpu(i):
    """Return the i-th GPU wrapper from the GPU list."""
    return cus[i]
def get_num_gpus():
    """Return the number of GPUs (HSA components) available."""
    return len(cus)
_custack = servicelib.TLStack()
def _get_device(devnum=0):
    """Return the active device, activating device *devnum* first when no
    device has been pushed on this thread yet."""
    if _custack:
        return _custack.top
    _custack.push(get_gpu(devnum))
    return _custack.top
def get_context(devnum=0):
    """Get the current device or use a device by device number, and
    return the HSA context.
    """
    device = _get_device(devnum=devnum)
    return device.associate_context()
def get_all_contexts():
    """Return the HSA context of every available GPU, in device order."""
    return list(map(get_context, range(get_num_gpus())))
def require_context(fn):
    """
    A decorator to ensure a context for the HSA subsystem
    """
    @functools.wraps(fn)
    def wrapper(*args, **kws):
        # Trigger lazy context creation for the current thread first.
        get_context()
        return fn(*args, **kws)
    return wrapper
def reset():
    """Reset every GPU (releasing its context) and clear the device stack."""
    cus.reset()
    _custack.clear()
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
"""Enum values for HSA from the HSA extension header
Note that Python namespacing could be used to avoid the C-like
prefixing, but we choose to keep the same names as found in the C
enums, in order to match the documentation.
"""
# These enums are a direct translation of those found in:
# hsa_ext_amd.h from the ROCR-Runtime. For example:
# https://github.com/RadeonOpenCompute/ROCR-Runtime/blob/master/src/inc/hsa_ext_amd.h
# Comments relating to the values are largely wholesale copied.
import ctypes
#------------------------------------------------------------------------------
#
# Anonymous enum expressing that a memory pool is invalid
#
HSA_STATUS_ERROR_INVALID_MEMORY_POOL = 40
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
#
# Agent attributes
#
# Enums of the type hsa_amd_agent_info_t
# Chip identifier. The type of this attribute is uint32_t.
HSA_AMD_AGENT_INFO_CHIP_ID = 0xA000
# Size of a cacheline in bytes. The type of this attribute is uint32_t.
HSA_AMD_AGENT_INFO_CACHELINE_SIZE = 0xA001
# The number of compute unit available in the agent. The type of this
# attribute is uint32_t.
HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT = 0xA002
# The maximum clock frequency of the agent in MHz. The type of this
# attribute is uint32_t.
HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY = 0xA003
# Internay driver node identifier. The type of this attribute is uint32_t.
HSA_AMD_AGENT_INFO_DRIVER_NODE_ID = 0xA004
# Max number of watch points on memory address ranges to generate exception
# events when the watched addresses are accessed.
HSA_AMD_AGENT_INFO_MAX_ADDRESS_WATCH_POINTS = 0xA005
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
#
# Region attributes
#
# Enums of the type hsa_amd_region_info_t
# Determine if host can access the region. The type of this attribute is bool.
HSA_AMD_REGION_INFO_HOST_ACCESSIBLE = 0xA000
# Base address of the region in flat address space.
HSA_AMD_REGION_INFO_BASE = 0xA001
# Memory Interface width, the return value type is uint32_t.
# This attribute is deprecated. Use HSA_AMD_AGENT_INFO_MEMORY_WIDTH.
HSA_AMD_REGION_INFO_BUS_WIDTH = 0xA002
# Max Memory Clock, the return value type is uint32_t.
# This attribute is deprecated. Use HSA_AMD_AGENT_INFO_MEMORY_MAX_FREQUENCY.
HSA_AMD_REGION_INFO_MAX_CLOCK_FREQUENCY = 0xA003
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
#
# Coherency attributes of a fine grained region
#
# Enums of the type hsa_amd_coherency_type_t
# Coherent region.
HSA_AMD_COHERENCY_TYPE_COHERENT = 0
# Non coherent region.
HSA_AMD_COHERENCY_TYPE_NONCOHERENT = 1
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
#
# Memory segments associated with a memory pool.
#
# Enums of the type hsa_amd_segment_t
# Global segment. Used to hold data that is shared by all agents.
HSA_AMD_SEGMENT_GLOBAL = 0
# Read-only segment. Used to hold data that remains constant during the
# execution of a kernel.
HSA_AMD_SEGMENT_READONLY = 1
# Private segment. Used to hold data that is local to a single work-item.
HSA_AMD_SEGMENT_PRIVATE = 2
# Group segment. Used to hold data that is shared by the work-items of a
# work-group.
HSA_AMD_SEGMENT_GROUP = 3
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
#
# Memory pool global flags.
#
# Enums of the type hsa_amd_memory_pool_global_flag_t.
# The application can use allocations in the memory pool to store kernel
# arguments, and provide the values for the kernarg segment of
# a kernel dispatch.
HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT = 1
# Updates to memory in this pool conform to HSA memory consistency model.
# If this flag is set, then HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED
# must not be set.
HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED = 2
# Writes to memory in this pool can be performed by a single agent at a time.
HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED = 4
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
#
# Memory pool features flags.
#
# Enums of the type hsa_amd_memory_pool_info_t.
# Segment where the memory pool resides. The type of this attribute is
# hsa_amd_segment_t.
HSA_AMD_MEMORY_POOL_INFO_SEGMENT = 0
# Flag mask. The value of this attribute is undefined if the value of
# HSA_AMD_MEMORY_POOL_INFO_SEGMENT is not HSA_AMD_SEGMENT_GLOBAL. The type
# of this attribute is uint32_t, a bit-field of
# hsa_amd_memory_pool_global_flag_t values.
HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS = 1
# Size of this pool, in bytes. The type of this attribute is size_t.
HSA_AMD_MEMORY_POOL_INFO_SIZE = 2
# Indicates whether memory in this pool can be allocated using
# hsa_amd_memory_pool_allocate. The type of this attribute is bool.
# The value of this flag is always false for memory pools in the group and
# private segments.
HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED = 5
# Allocation granularity of buffers allocated by hsa_amd_memory_pool_allocate
# in this memory pool. The size of a buffer allocated in this pool is a
# multiple of the value of this attribute. The value of this attribute is
# only defined if HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED is true for
# this pool. The type of this attribute is size_t.
HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE = 6
# Alignment of buffers allocated by hsa_amd_memory_pool_allocate in this
# pool. The value of this attribute is only defined if
# HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED is true for this pool, and
# must be a power of 2. The type of this attribute is size_t.
HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT = 7
# This memory_pool can be made directly accessible by all the agents in the
# system (hsa_amd_agent_memory_pool_get_info returns
# HSA_AMD_MEMORY_POOL_ACCESS_ALLOWED_BY_DEFAULT for all agents). The type of
# this attribute is bool.
HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL = 15
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
#
# Type of accesses to a memory pool from a given agent.
#
# Enums of the type hsa_amd_memory_pool_access_t
# The agent cannot directly access any buffer in the memory pool.
HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED = 0
# The agent can directly access a buffer located in the pool; the application
# does not need to invoke hsa_amd_agents_allow_access.
HSA_AMD_MEMORY_POOL_ACCESS_ALLOWED_BY_DEFAULT = 1
# The agent can directly access a buffer located in the pool, but only if the
# application has previously requested access to that buffer using
# hsa_amd_agents_allow_access.
HSA_AMD_MEMORY_POOL_ACCESS_DISALLOWED_BY_DEFAULT = 2
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
#
# Properties of the relationship between an agent a memory pool.
#
# Enums of the type hsa_amd_link_info_type_t
# Hyper-transport bus type.
HSA_AMD_LINK_INFO_TYPE_HYPERTRANSPORT = 0
# QPI bus type.
HSA_AMD_LINK_INFO_TYPE_QPI = 1
# PCIe bus type.
HSA_AMD_LINK_INFO_TYPE_PCIE = 2
# Infiniband bus type.
HSA_AMD_LINK_INFO_TYPE_INFINBAND = 3
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
#
# Access to buffers located in the memory pool. The type of this attribute
# is hsa_amd_memory_pool_access_t.
#
# Enums of type hsa_amd_agent_memory_pool_info_t.
# An agent can always directly access buffers currently located in a memory
# pool that is associated (the memory_pool is one of the values returned by
# hsa_amd_agent_iterate_memory_pools on the agent) with that agent. If the
# buffer is currently located in a memory pool that is not associated with
# the agent, and the value returned by this function for the given
# combination of agent and memory pool is not
# HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED, the application still needs to
# invoke hsa_amd_agents_allow_access in order to gain direct access to the
# buffer.
# If the given agent can directly access buffers the pool, the result is not
# HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED. If the memory pool is associated
# with the agent, or it is of fined-grained type, the result must not be
# HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED. If the memory pool is not
# associated with the agent, and does not reside in the global segment, the
# result must be HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED.
HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS = 0
# Number of links to hop when accessing the memory pool from the specified
# agent. The type of this attribute is uint32_t.
HSA_AMD_AGENT_MEMORY_POOL_INFO_NUM_LINK_HOPS = 1
# Details of each link hop when accessing the memory pool starting from the
# specified agent. The type of this attribute is an array size of
# HSA_AMD_AGENT_MEMORY_POOL_INFO_NUM_LINK_HOPS with each element containing
# hsa_amd_memory_pool_link_info_t.
HSA_AMD_AGENT_MEMORY_POOL_INFO_LINK_INFO = 2
#------------------------------------------------------------------------------
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
from .service import Service
from .threadlocal import TLStack
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment